kushalpatel0265 committed
Commit def6165 · verified
1 Parent(s): 17c9f49

Update app.py

Files changed (1)
  1. app.py +72 -196
app.py CHANGED
@@ -33,7 +33,6 @@ def load_raw_data(filepath):
33
  DATA_PATH = "data.jsonl"
34
  if not os.path.exists(DATA_PATH):
35
  st.error("data.jsonl file not found. Please ensure it is in the same directory as this app.")
36
- st.stop()
37
  else:
38
  raw_df = load_raw_data(DATA_PATH)
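load_raw_data itself sits above this hunk and is not shown; for a .jsonl file it is presumably a cached reader along these lines (an assumption, using the pd/st imports already present in app.py):

```python
@st.cache_data
def load_raw_data(filepath):
    # data.jsonl is newline-delimited JSON: one post per line.
    return pd.read_json(filepath, lines=True)
```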
39
 
@@ -68,7 +67,7 @@ elif "title" in df.columns:
68
  else:
69
  text_col = None
70
 
71
- # For hashtags: if not provided, extract them from text using regex.
72
  if "hashtags" not in df.columns:
73
  def extract_hashtags(row):
74
  text = ""
@@ -106,7 +105,6 @@ if timestamp_col in df.columns:
106
  end_date = st.sidebar.date_input("End date", max_date, min_value=min_date, max_value=max_date)
107
  if start_date > end_date:
108
  st.sidebar.error("Error: End date must fall after start date.")
109
- # Filter df between selected dates
110
  df = df[(df[timestamp_col].dt.date >= start_date) & (df[timestamp_col].dt.date <= end_date)]
111
  except Exception as e:
112
  st.sidebar.error("Error processing the timestamp column for filtering.")
@@ -144,90 +142,15 @@ if timestamp_col in df.columns:
144
  df["date"] = df[timestamp_col].dt.date
145
  time_series = df.groupby("date").size().reset_index(name="count")
146
  time_series["7-day Moving Avg"] = time_series["count"].rolling(window=7).mean()
147
- fig_time = px.line(
148
- time_series, x="date", y=["count", "7-day Moving Avg"],
149
- labels={"date": "Date", "value": "Number of Posts"},
150
- title="Posts Over Time with 7-day Moving Average"
151
- )
152
  st.plotly_chart(fig_time)
153
  else:
154
  st.info("No timestamp data available for time series plot.")
155
 
156
- # --------------------------------------------------------------------------------
157
- # --------------------------- Network Diagram (Above Pie) -------------------------
158
- # --------------------------------------------------------------------------------
159
- """
160
- We'll create a user <-> community network from the top users and top subreddits.
161
- For simplicity, we only include each user/subreddit once to avoid extremely large networks.
162
- """
163
- st.markdown("### Network Diagram")
164
-
165
  community_col = "subreddit" if "subreddit" in df.columns else user_col
166
-
167
- # Build a small network of user->community edges
168
- if community_col in df.columns and user_col in df.columns:
169
- # Let's focus on top communities
170
- top_communities_df = df[community_col].value_counts().nlargest(5) # top 5 subreddits or communities
171
- top_communities = set(top_communities_df.index)
172
-
173
- # For each row, if subreddit in top_communities, link author->subreddit
174
- # For performance, take a sample of the entire dataset or filter only relevant rows.
175
- sub_df = df[df[community_col].isin(top_communities)].copy()
176
- sub_df = sub_df.dropna(subset=[user_col, community_col])
177
- sub_df = sub_df.sample(min(500, len(sub_df)), random_state=42) # sample to reduce network size
178
-
179
- net = Network(height="600px", width="100%", notebook=False, bgcolor="#ffffff", font_color="black")
180
-
181
- # We'll track which nodes we've added to avoid duplicates
182
- added_users = set()
183
- added_comms = set()
184
-
185
- for _, row in sub_df.iterrows():
186
- user = str(row[user_col])
187
- comm = str(row[community_col])
188
-
189
- if user not in added_users:
190
- net.add_node(user, label=user, color="#FFAAAA") # user node
191
- added_users.add(user)
192
-
193
- if comm not in added_comms:
194
- net.add_node(comm, label=comm, color="#AAAACC") # community node
195
- added_comms.add(comm)
196
-
197
- net.add_edge(user, comm)
198
-
199
- net.set_options("""
200
- var options = {
201
- "nodes": {
202
- "scaling": {
203
- "min": 10,
204
- "max": 30
205
- }
206
- },
207
- "edges": {
208
- "smooth": {
209
- "type": "continuous"
210
- }
211
- },
212
- "physics": {
213
- "barnesHut": {
214
- "gravitationalConstant": -8000,
215
- "springLength": 250
216
- }
217
- }
218
- }
219
- """)
220
- # Generate network HTML
221
- net.save_graph("network.html")
222
- html_file = open("network.html", "r", encoding="utf-8")
223
- components.html(html_file.read(), height=620)
224
- html_file.close()
225
- else:
226
- st.info("Cannot build a network diagram without both user and community/subreddit columns.")
227
-
228
- # --------------------------------------------------------------------------------
229
- # --------------------------- Pie Chart of Top Contributors -----------------------
230
- # --------------------------------------------------------------------------------
231
  if community_col in df.columns:
232
  st.markdown("### Top Communities/Accounts Contributions")
233
  contributions = df[community_col].value_counts().reset_index()
@@ -239,9 +162,6 @@ if community_col in df.columns:
239
  else:
240
  st.info("No community or account data available for contributor pie chart.")
241
 
242
- # --------------------------------------------------------------------------------
243
- # ---------------------- Top Hashtags & Sentiment Analysis -----------------------
244
- # --------------------------------------------------------------------------------
245
  # Top Hashtags Bar Chart
246
  if hashtags_col in df.columns:
247
  st.markdown("### Top Hashtags")
@@ -250,11 +170,9 @@ if hashtags_col in df.columns:
250
  top_hashtags = hashtags_exploded[hashtags_col].value_counts().reset_index()
251
  top_hashtags.columns = ['hashtag', 'count']
252
  if not top_hashtags.empty:
253
- fig_hashtags = px.bar(
254
- top_hashtags.head(10), x='hashtag', y='count',
255
- labels={'hashtag': 'Hashtag', 'count': 'Frequency'},
256
- title="Top 10 Hashtags"
257
- )
258
  st.plotly_chart(fig_hashtags)
259
  else:
260
  st.info("No hashtag data available.")
@@ -265,19 +183,19 @@ else:
265
  if text_col is not None and text_col in df.columns:
266
  st.markdown("### Sentiment Analysis")
267
  df['sentiment'] = df[text_col].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0)
268
- fig_sentiment = px.histogram(
269
- df, x='sentiment', nbins=30,
270
- labels={'sentiment': 'Sentiment Polarity'},
271
- title="Sentiment Polarity Distribution"
272
- )
273
  st.plotly_chart(fig_sentiment)
274
  else:
275
  st.info(f"No '{text_col}' column available for sentiment analysis.")
276
 
277
  # --------------------------------------------------------------------------------
278
- # ------------------------- Topic Embedding Visualization -------------------------
279
  # --------------------------------------------------------------------------------
280
- st.markdown("## Topic Embedding Visualization (LDA + TSNE)")
 
 
281
  if text_col in df.columns:
282
  texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
283
  vectorizer = CountVectorizer(stop_words='english', max_features=1000)
@@ -285,57 +203,46 @@ if text_col in df.columns:
285
  lda = LatentDirichletAllocation(n_components=5, random_state=42)
286
  topic_matrix = lda.fit_transform(X)
287
  dominant_topic = topic_matrix.argmax(axis=1)
288
-
289
  tsne_model = TSNE(n_components=2, random_state=42)
290
  tsne_values = tsne_model.fit_transform(topic_matrix)
291
  tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
292
  tsne_df["Dominant Topic"] = dominant_topic.astype(str)
293
-
294
- fig_topics = px.scatter(
295
- tsne_df, x="x", y="y", color="Dominant Topic",
296
- title="TSNE Embedding of Topics"
297
- )
298
  st.plotly_chart(fig_topics)
299
  else:
300
  st.info("No text data available for topic embedding.")
301
 
302
- # --------------------------------------------------------------------------------
303
- # ----------------------- GenAI Summary for Time Series Plot ---------------------
304
- # --------------------------------------------------------------------------------
305
  st.markdown("## GenAI Summary for Time Series")
306
  if timestamp_col in df.columns:
307
- time_df = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
308
- if not time_df.empty:
309
- start = time_df[timestamp_col].min()
310
- end = time_df[timestamp_col].max()
311
- avg_posts = time_df["count"].mean()
312
- peak = time_df.loc[time_df["count"].idxmax()]
313
- description = (
314
- f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
315
- f"The highest activity was on {peak[timestamp_col]} with {peak['count']} posts."
316
- )
317
-
318
  st.write("Time Series Description:")
319
  st.write(description)
320
 
321
- # Use a smaller, faster FLAN-T5 model
322
- ts_summarizer = pipeline("text2text-generation", model="google/flan-t5-base")
323
  try:
324
- # We'll prompt it in a summarization style for clarity
325
- prompt = f"Summarize this data description: {description}"
326
- ts_summary = ts_summarizer(prompt, max_length=80, do_sample=False)[0]['generated_text']
327
  st.markdown("**GenAI Summary:**")
328
  st.write(ts_summary)
329
  except Exception as e:
330
- st.error(f"Error generating time series summary: {e}")
331
  else:
332
- st.info("No data available for time series summarization.")
333
  else:
334
  st.info("No timestamp column available for time series summary.")
335
 
336
- # --------------------------------------------------------------------------------
337
- # ----------------------- Offline Events from Wikipedia --------------------------
338
- # --------------------------------------------------------------------------------
339
  st.markdown("## Offline Events from Wikipedia")
340
  wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
341
  if wiki_topic:
@@ -344,59 +251,39 @@ if wiki_topic:
344
  st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
345
  st.write(wiki_summary)
346
  except Exception as e:
347
- st.error(f"Error retrieving Wikipedia data: {e}")
348
 
349
- # --------------------------------------------------------------------------------
350
- # ----------------- Semantic Search on Posts using Sentence Transformers ---------
351
- # --------------------------------------------------------------------------------
352
  st.markdown("## Semantic Search on Posts")
353
- if text_col and text_col in df.columns:
354
- search_query = st.text_input("Enter your semantic search query:")
355
- if search_query:
356
- @st.cache_data
357
- def get_post_embeddings(texts):
358
- # Use a smaller, faster model
359
- model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
360
- return model.encode(texts, convert_to_tensor=True)
361
-
362
- posts = df[text_col].dropna().tolist()
363
-
364
- if posts:
365
- embeddings = get_post_embeddings(posts)
366
- model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
367
- query_embedding = model.encode(search_query, convert_to_tensor=True)
368
-
369
- cos_scores = util.cos_sim(query_embedding, embeddings)[0]
370
- top_results = cos_scores.topk(5)
371
-
372
- st.markdown("**Top Matching Posts:**")
373
- for score, idx in zip(top_results.values, top_results.indices):
374
- st.write(f"Score: {score.item():.3f}")
375
- st.write(posts[idx])
376
- st.write("---")
377
- else:
378
- st.info("No text data available for semantic search.")
379
- else:
380
- st.info("No text column available to perform semantic search.")
381
-
382
- # --------------------------------------------------------------------------------
383
- # ------------------------ AI-Generated Summary of Posts -------------------------
384
- # --------------------------------------------------------------------------------
385
  st.markdown("## AI-Generated Summary of Posts")
386
  if text_col in df.columns:
387
- # Use the same FLAN-T5 base model or DistilBart for summarization
388
- summarizer = pipeline("text2text-generation", model="google/flan-t5-base")
389
 
390
  def generate_summary(text, summarizer, max_chunk_length=1000):
391
- """
392
- Break text into chunks of up to max_chunk_length,
393
- and pass them through the summarizer in sequence,
394
- then do a final summarization pass on the combined summary.
395
- """
396
- sentences = text.split('. ')
397
  chunks, current_chunk = [], ""
398
-
399
- for sentence in sentences:
400
  sentence = sentence.strip() + ". "
401
  if len(current_chunk) + len(sentence) <= max_chunk_length:
402
  current_chunk += sentence
@@ -406,23 +293,21 @@ if text_col in df.columns:
406
  if current_chunk:
407
  chunks.append(current_chunk.strip())
408
 
409
- # Summarize each chunk
410
- interim_summaries = []
411
  for chunk in chunks:
412
  if len(chunk) > 50:
413
- prompt = f"Summarize this text: {chunk}"
414
- summary_chunk = summarizer(prompt, max_length=150, do_sample=False)[0]['generated_text']
415
- interim_summaries.append(summary_chunk)
416
-
417
- # Summarize the combined interim summary
418
- combined_summary = " ".join(interim_summaries)
419
- final_prompt = f"Summarize this overall text: {combined_summary}"
420
- final_summary = summarizer(final_prompt, max_length=150, do_sample=False)[0]['generated_text']
421
  return final_summary
422
 
423
- # Take a sample of up to 10 random posts
424
  sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
425
- if sample_text.strip():
426
  final_summary = generate_summary(sample_text, summarizer, max_chunk_length=1000)
427
  st.write(final_summary)
428
  else:
@@ -435,15 +320,6 @@ else:
435
  # --------------------------------------------------------------------------------
436
  st.markdown("### End of Dashboard")
437
  st.markdown("""
438
- This dashboard is a prototype for analyzing Reddit social media data.
439
- It demonstrates:
440
- - Trend analysis with a 7-day moving average
441
- - A user-to-community network diagram
442
- - Top contributors and hashtags
443
- - Sentiment analysis
444
- - Topic embeddings with LDA + t-SNE
445
- - **GenAI time series summary** (FLAN-T5)
446
- - **Offline Wikipedia events** integration
447
- - **Semantic search** with Sentence Transformers
448
- - **Full AI-generated summary** of posts
449
  """)
 
app.py (after change)

33
  DATA_PATH = "data.jsonl"
34
  if not os.path.exists(DATA_PATH):
35
  st.error("data.jsonl file not found. Please ensure it is in the same directory as this app.")
 
36
  else:
37
  raw_df = load_raw_data(DATA_PATH)
38
 
 
67
  else:
68
  text_col = None
69
 
70
+ # For hashtags: if not provided, extract from text using regex.
71
  if "hashtags" not in df.columns:
72
  def extract_hashtags(row):
73
  text = ""
 
105
  end_date = st.sidebar.date_input("End date", max_date, min_value=min_date, max_value=max_date)
106
  if start_date > end_date:
107
  st.sidebar.error("Error: End date must fall after start date.")
 
108
  df = df[(df[timestamp_col].dt.date >= start_date) & (df[timestamp_col].dt.date <= end_date)]
109
  except Exception as e:
110
  st.sidebar.error("Error processing the timestamp column for filtering.")
 
142
  df["date"] = df[timestamp_col].dt.date
143
  time_series = df.groupby("date").size().reset_index(name="count")
144
  time_series["7-day Moving Avg"] = time_series["count"].rolling(window=7).mean()
145
+ fig_time = px.line(time_series, x="date", y=["count", "7-day Moving Avg"],
146
+ labels={"date": "Date", "value": "Number of Posts"},
147
+ title="Posts Over Time with 7-day Moving Average")
 
 
148
  st.plotly_chart(fig_time)
149
  else:
150
  st.info("No timestamp data available for time series plot.")
151
 
152
+ # Pie Chart of Top Contributors (using subreddit if available, otherwise author)
153
  community_col = "subreddit" if "subreddit" in df.columns else user_col
154
  if community_col in df.columns:
155
  st.markdown("### Top Communities/Accounts Contributions")
156
  contributions = df[community_col].value_counts().reset_index()
 
162
  else:
163
  st.info("No community or account data available for contributor pie chart.")
164
165
  # Top Hashtags Bar Chart
166
  if hashtags_col in df.columns:
167
  st.markdown("### Top Hashtags")
 
170
  top_hashtags = hashtags_exploded[hashtags_col].value_counts().reset_index()
171
  top_hashtags.columns = ['hashtag', 'count']
172
  if not top_hashtags.empty:
173
+ fig_hashtags = px.bar(top_hashtags.head(10), x='hashtag', y='count',
174
+ labels={'hashtag': 'Hashtag', 'count': 'Frequency'},
175
+ title="Top 10 Hashtags")
 
 
176
  st.plotly_chart(fig_hashtags)
177
  else:
178
  st.info("No hashtag data available.")
 
183
  if text_col is not None and text_col in df.columns:
184
  st.markdown("### Sentiment Analysis")
185
  df['sentiment'] = df[text_col].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0)
186
+ fig_sentiment = px.histogram(df, x='sentiment', nbins=30,
187
+ labels={'sentiment': 'Sentiment Polarity'},
188
+ title="Sentiment Polarity Distribution")
 
 
189
  st.plotly_chart(fig_sentiment)
190
  else:
191
  st.info(f"No '{text_col}' column available for sentiment analysis.")
192
 
193
  # --------------------------------------------------------------------------------
194
+ # ------------------------------ Additional Features -----------------------------
195
  # --------------------------------------------------------------------------------
196
+
197
+ # (a) Topic Embedding Visualization using LDA + TSNE
198
+ st.markdown("## Topic Embedding Visualization")
199
  if text_col in df.columns:
200
  texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
201
  vectorizer = CountVectorizer(stop_words='english', max_features=1000)
 
203
  lda = LatentDirichletAllocation(n_components=5, random_state=42)
204
  topic_matrix = lda.fit_transform(X)
205
  dominant_topic = topic_matrix.argmax(axis=1)
 
206
  tsne_model = TSNE(n_components=2, random_state=42)
207
  tsne_values = tsne_model.fit_transform(topic_matrix)
208
  tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
209
  tsne_df["Dominant Topic"] = dominant_topic.astype(str)
210
+ fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
211
+ title="TSNE Embedding of Topics")
 
 
 
212
  st.plotly_chart(fig_topics)
213
  else:
214
  st.info("No text data available for topic embedding.")
215
 
216
+ # (b) GenAI Summary for Time Series Plot
 
 
217
  st.markdown("## GenAI Summary for Time Series")
218
  if timestamp_col in df.columns:
219
+ time_series = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
220
+ if not time_series.empty:
221
+ start = time_series["created_utc"].min()
222
+ end = time_series["created_utc"].max()
223
+ avg_posts = time_series["count"].mean()
224
+ peak = time_series.loc[time_series["count"].idxmax()]
225
+ description = (f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
226
+ f"The highest activity was on {peak['created_utc']} with {peak['count']} posts.")
 
 
 
227
  st.write("Time Series Description:")
228
  st.write(description)
229
 
230
+ # Use a smaller, faster summarization model
231
+ ts_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
232
  try:
233
+ ts_summary = ts_summarizer(
234
+ description, max_length=80, min_length=40, do_sample=False
235
+ )[0]['summary_text']
236
  st.markdown("**GenAI Summary:**")
237
  st.write(ts_summary)
238
  except Exception as e:
239
+ st.error("Error generating time series summary.")
240
  else:
241
+ st.info("Time series data not available for summarization.")
242
  else:
243
  st.info("No timestamp column available for time series summary.")
244
 
245
+ # (c) Offline Events from Wikipedia
 
 
246
  st.markdown("## Offline Events from Wikipedia")
247
  wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
248
  if wiki_topic:
 
251
  st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
252
  st.write(wiki_summary)
253
  except Exception as e:
254
+ st.error("Error retrieving Wikipedia data. Please check the topic name.")
255
 
256
+ # (d) Semantic Search on Posts using Sentence Transformers
 
 
257
  st.markdown("## Semantic Search on Posts")
258
+ search_query = st.text_input("Enter your semantic search query:")
259
+ if search_query and text_col in df.columns:
260
+ @st.cache_data
261
+ def get_post_embeddings(texts):
262
+ # Use a smaller, faster model
263
+ model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
264
+ return model.encode(texts, convert_to_tensor=True)
265
+
266
+ posts = df[text_col].dropna().tolist()
267
+ embeddings = get_post_embeddings(posts)
268
+ model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
269
+ query_embedding = model.encode(search_query, convert_to_tensor=True)
270
+ cos_scores = util.cos_sim(query_embedding, embeddings)[0]
271
+ top_results = cos_scores.topk(5)
272
+
273
+ st.markdown("**Top Matching Posts:**")
274
+ for score, idx in zip(top_results.values, top_results.indices):
275
+ st.write(f"Score: {score.item():.3f}")
276
+ st.write(posts[idx])
277
+ st.write("---")
278
+
279
+ # (e) AI-Generated Summary of Posts
280
  st.markdown("## AI-Generated Summary of Posts")
281
  if text_col in df.columns:
282
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 
283
 
284
  def generate_summary(text, summarizer, max_chunk_length=1000):
285
  chunks, current_chunk = [], ""
286
+ for sentence in text.split('. '):
 
287
  sentence = sentence.strip() + ". "
288
  if len(current_chunk) + len(sentence) <= max_chunk_length:
289
  current_chunk += sentence
 
293
  if current_chunk:
294
  chunks.append(current_chunk.strip())
295
 
296
+ summaries = []
 
297
  for chunk in chunks:
298
  if len(chunk) > 50:
299
+ summary_chunk = summarizer(
300
+ chunk, max_length=150, min_length=40, do_sample=False
301
+ )[0]['summary_text']
302
+ summaries.append(summary_chunk)
303
+ combined_summary = " ".join(summaries)
304
+ final_summary = summarizer(
305
+ combined_summary, max_length=150, min_length=40, do_sample=False
306
+ )[0]['summary_text']
307
  return final_summary
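If every chunk is 50 characters or shorter, summaries stays empty and the final summarizer call receives an empty string. A small fallback inside generate_summary (a sketch) covers that case:

```python
combined_summary = " ".join(summaries)
if not combined_summary.strip():
    # Nothing long enough to summarize; return the original sample instead of calling the model.
    return text
final_summary = summarizer(
    combined_summary, max_length=150, min_length=40, do_sample=False
)[0]['summary_text']
return final_summary
```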
308
 
 
309
  sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
310
+ if sample_text:
311
  final_summary = generate_summary(sample_text, summarizer, max_chunk_length=1000)
312
  st.write(final_summary)
313
  else:
 
320
  # --------------------------------------------------------------------------------
321
  st.markdown("### End of Dashboard")
322
  st.markdown("""
323
+ This dashboard is a prototype implementation for analyzing Reddit social media data.
324
+ It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality **using faster models**.
325
  """)