kushalpatel0265 committed on
Commit aec0fd0 · verified · 1 Parent(s): 6d185a4

Update app.py

Files changed (1)
  1. app.py +196 -72
app.py CHANGED
@@ -33,6 +33,7 @@ def load_raw_data(filepath):
 DATA_PATH = "data.jsonl"
 if not os.path.exists(DATA_PATH):
     st.error("data.jsonl file not found. Please ensure it is in the same directory as this app.")
+    st.stop()
 else:
     raw_df = load_raw_data(DATA_PATH)
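The `load_raw_data(filepath)` helper named in the hunk header sits outside the diff context, so its body is not shown. For orientation, a minimal sketch of what a cached JSONL loader for `data.jsonl` typically looks like in Streamlit; the body below is an assumption, not code from this commit:

```python
import pandas as pd
import streamlit as st

@st.cache_data
def load_raw_data(filepath):
    # data.jsonl is newline-delimited JSON: one post per line.
    return pd.read_json(filepath, lines=True)
```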
 
@@ -67,7 +68,7 @@ elif "title" in df.columns:
 else:
     text_col = None
 
-# For hashtags: if not provided, extract from text using regex.
+# For hashtags: if not provided, extract them from text using regex.
 if "hashtags" not in df.columns:
     def extract_hashtags(row):
         text = ""
@@ -105,6 +106,7 @@ if timestamp_col in df.columns:
         end_date = st.sidebar.date_input("End date", max_date, min_value=min_date, max_value=max_date)
         if start_date > end_date:
             st.sidebar.error("Error: End date must fall after start date.")
+        # Filter df between selected dates
         df = df[(df[timestamp_col].dt.date >= start_date) & (df[timestamp_col].dt.date <= end_date)]
     except Exception as e:
         st.sidebar.error("Error processing the timestamp column for filtering.")
@@ -142,15 +144,90 @@ if timestamp_col in df.columns:
     df["date"] = df[timestamp_col].dt.date
     time_series = df.groupby("date").size().reset_index(name="count")
     time_series["7-day Moving Avg"] = time_series["count"].rolling(window=7).mean()
-    fig_time = px.line(time_series, x="date", y=["count", "7-day Moving Avg"],
-                       labels={"date": "Date", "value": "Number of Posts"},
-                       title="Posts Over Time with 7-day Moving Average")
+    fig_time = px.line(
+        time_series, x="date", y=["count", "7-day Moving Avg"],
+        labels={"date": "Date", "value": "Number of Posts"},
+        title="Posts Over Time with 7-day Moving Average"
+    )
     st.plotly_chart(fig_time)
 else:
     st.info("No timestamp data available for time series plot.")
 
-# Pie Chart of Top Contributors (using subreddit if available, otherwise author)
+# --------------------------------------------------------------------------------
+# --------------------------- Network Diagram (Above Pie) -------------------------
+# --------------------------------------------------------------------------------
+"""
+We'll create a user <-> community network from the top users and top subreddits.
+For simplicity, we only include each user/subreddit once to avoid extremely large networks.
+"""
+st.markdown("### Network Diagram")
+
 community_col = "subreddit" if "subreddit" in df.columns else user_col
+
+# Build a small network of user->community edges
+if community_col in df.columns and user_col in df.columns:
+    # Let's focus on top communities
+    top_communities_df = df[community_col].value_counts().nlargest(5)  # top 5 subreddits or communities
+    top_communities = set(top_communities_df.index)
+
+    # For each row, if subreddit in top_communities, link author->subreddit
+    # For performance, take a sample of the entire dataset or filter only relevant rows.
+    sub_df = df[df[community_col].isin(top_communities)].copy()
+    sub_df = sub_df.dropna(subset=[user_col, community_col])
+    sub_df = sub_df.sample(min(500, len(sub_df)), random_state=42)  # sample to reduce network size
+
+    net = Network(height="600px", width="100%", notebook=False, bgcolor="#ffffff", font_color="black")
+
+    # We'll track which nodes we've added to avoid duplicates
+    added_users = set()
+    added_comms = set()
+
+    for _, row in sub_df.iterrows():
+        user = str(row[user_col])
+        comm = str(row[community_col])
+
+        if user not in added_users:
+            net.add_node(user, label=user, color="#FFAAAA")  # user node
+            added_users.add(user)
+
+        if comm not in added_comms:
+            net.add_node(comm, label=comm, color="#AAAACC")  # community node
+            added_comms.add(comm)
+
+        net.add_edge(user, comm)
+
+    net.set_options("""
+    var options = {
+      "nodes": {
+        "scaling": {
+          "min": 10,
+          "max": 30
+        }
+      },
+      "edges": {
+        "smooth": {
+          "type": "continuous"
+        }
+      },
+      "physics": {
+        "barnesHut": {
+          "gravitationalConstant": -8000,
+          "springLength": 250
+        }
+      }
+    }
+    """)
+    # Generate network HTML
+    net.save_graph("network.html")
+    html_file = open("network.html", "r", encoding="utf-8")
+    components.html(html_file.read(), height=620)
+    html_file.close()
+else:
+    st.info("Cannot build a network diagram without both user and community/subreddit columns.")
+
+# --------------------------------------------------------------------------------
+# --------------------------- Pie Chart of Top Contributors -----------------------
+# --------------------------------------------------------------------------------
 if community_col in df.columns:
     st.markdown("### Top Communities/Accounts Contributions")
     contributions = df[community_col].value_counts().reset_index()
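A note on the pyvis rendering added in this hunk: it writes `network.html` to disk and reads it back, which works but leaves a temporary file behind. Newer pyvis releases can hand the HTML straight to Streamlit; a hedged alternative, assuming a pyvis version that provides `generate_html()` (otherwise the save-and-read approach above is fine):

```python
import streamlit.components.v1 as components

# Render the pyvis network without an intermediate file (pyvis >= 0.3).
components.html(net.generate_html(), height=620)
```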
@@ -162,6 +239,9 @@ if community_col in df.columns:
 else:
     st.info("No community or account data available for contributor pie chart.")
 
+# --------------------------------------------------------------------------------
+# ---------------------- Top Hashtags & Sentiment Analysis -----------------------
+# --------------------------------------------------------------------------------
 # Top Hashtags Bar Chart
 if hashtags_col in df.columns:
     st.markdown("### Top Hashtags")
@@ -170,9 +250,11 @@ if hashtags_col in df.columns:
     top_hashtags = hashtags_exploded[hashtags_col].value_counts().reset_index()
     top_hashtags.columns = ['hashtag', 'count']
     if not top_hashtags.empty:
-        fig_hashtags = px.bar(top_hashtags.head(10), x='hashtag', y='count',
-                              labels={'hashtag': 'Hashtag', 'count': 'Frequency'},
-                              title="Top 10 Hashtags")
+        fig_hashtags = px.bar(
+            top_hashtags.head(10), x='hashtag', y='count',
+            labels={'hashtag': 'Hashtag', 'count': 'Frequency'},
+            title="Top 10 Hashtags"
+        )
         st.plotly_chart(fig_hashtags)
     else:
         st.info("No hashtag data available.")
@@ -183,19 +265,19 @@ else:
 if text_col is not None and text_col in df.columns:
     st.markdown("### Sentiment Analysis")
     df['sentiment'] = df[text_col].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0)
-    fig_sentiment = px.histogram(df, x='sentiment', nbins=30,
-                                 labels={'sentiment': 'Sentiment Polarity'},
-                                 title="Sentiment Polarity Distribution")
+    fig_sentiment = px.histogram(
+        df, x='sentiment', nbins=30,
+        labels={'sentiment': 'Sentiment Polarity'},
+        title="Sentiment Polarity Distribution"
+    )
     st.plotly_chart(fig_sentiment)
 else:
     st.info(f"No '{text_col}' column available for sentiment analysis.")
 
 # --------------------------------------------------------------------------------
-# ------------------------------ Additional Features -----------------------------
+# ------------------------- Topic Embedding Visualization -------------------------
 # --------------------------------------------------------------------------------
-
-# (a) Topic Embedding Visualization using LDA + TSNE
-st.markdown("## Topic Embedding Visualization")
+st.markdown("## Topic Embedding Visualization (LDA + TSNE)")
 if text_col in df.columns:
     texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
     vectorizer = CountVectorizer(stop_words='english', max_features=1000)
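On the sentiment step in the hunk above: TextBlob's `sentiment.polarity` is a float in [-1, 1], so the histogram centers on 0 for neutral text. If a categorical breakdown is also wanted, a small sketch; the `label_sentiment` helper and its thresholds are illustrative, not part of this commit:

```python
from textblob import TextBlob

def label_sentiment(text):
    # Polarity runs from -1 (negative) to +1 (positive).
    polarity = TextBlob(text).sentiment.polarity if isinstance(text, str) else 0.0
    if polarity > 0.05:
        return "positive"
    if polarity < -0.05:
        return "negative"
    return "neutral"

# e.g. df["sentiment_label"] = df[text_col].apply(label_sentiment)
```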
@@ -203,46 +285,57 @@ if text_col in df.columns:
     lda = LatentDirichletAllocation(n_components=5, random_state=42)
     topic_matrix = lda.fit_transform(X)
     dominant_topic = topic_matrix.argmax(axis=1)
+
     tsne_model = TSNE(n_components=2, random_state=42)
     tsne_values = tsne_model.fit_transform(topic_matrix)
     tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
     tsne_df["Dominant Topic"] = dominant_topic.astype(str)
-    fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
-                            title="TSNE Embedding of Topics")
+
+    fig_topics = px.scatter(
+        tsne_df, x="x", y="y", color="Dominant Topic",
+        title="TSNE Embedding of Topics"
+    )
     st.plotly_chart(fig_topics)
 else:
     st.info("No text data available for topic embedding.")
 
-# (b) GenAI Summary for Time Series Plot
+# --------------------------------------------------------------------------------
+# ----------------------- GenAI Summary for Time Series Plot ---------------------
+# --------------------------------------------------------------------------------
 st.markdown("## GenAI Summary for Time Series")
 if timestamp_col in df.columns:
-    time_series = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
-    if not time_series.empty:
-        start = time_series["created_utc"].min()
-        end = time_series["created_utc"].max()
-        avg_posts = time_series["count"].mean()
-        peak = time_series.loc[time_series["count"].idxmax()]
-        description = (f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
-                       f"The highest activity was on {peak['created_utc']} with {peak['count']} posts.")
+    time_df = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
+    if not time_df.empty:
+        start = time_df[timestamp_col].min()
+        end = time_df[timestamp_col].max()
+        avg_posts = time_df["count"].mean()
+        peak = time_df.loc[time_df["count"].idxmax()]
+        description = (
+            f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
+            f"The highest activity was on {peak[timestamp_col]} with {peak['count']} posts."
+        )
+
         st.write("Time Series Description:")
         st.write(description)
 
-        # Use a smaller, faster summarization model
-        ts_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+        # Use a smaller, faster FLAN-T5 model
+        ts_summarizer = pipeline("text2text-generation", model="google/flan-t5-base")
         try:
-            ts_summary = ts_summarizer(
-                description, max_length=80, min_length=40, do_sample=False
-            )[0]['summary_text']
+            # We'll prompt it in a summarization style for clarity
+            prompt = f"Summarize this data description: {description}"
+            ts_summary = ts_summarizer(prompt, max_length=80, do_sample=False)[0]['generated_text']
             st.markdown("**GenAI Summary:**")
             st.write(ts_summary)
         except Exception as e:
-            st.error("Error generating time series summary.")
+            st.error(f"Error generating time series summary: {e}")
     else:
-        st.info("Time series data not available for summarization.")
+        st.info("No data available for time series summarization.")
 else:
     st.info("No timestamp column available for time series summary.")
 
-# (c) Offline Events from Wikipedia
+# --------------------------------------------------------------------------------
+# ----------------------- Offline Events from Wikipedia --------------------------
+# --------------------------------------------------------------------------------
 st.markdown("## Offline Events from Wikipedia")
 wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
 if wiki_topic:
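One robustness note on the t-SNE step in this hunk: recent scikit-learn releases require `perplexity` to be strictly smaller than the number of samples, so a heavily filtered dataset can make `fit_transform` raise with the default of 30. A hedged guard, with the exact bound chosen for illustration:

```python
from sklearn.manifold import TSNE

n_samples = topic_matrix.shape[0]
tsne_model = TSNE(
    n_components=2,
    perplexity=min(30, max(2, n_samples - 1)),  # keep perplexity < n_samples
    random_state=42,
)
```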
@@ -251,39 +344,59 @@
         st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
         st.write(wiki_summary)
     except Exception as e:
-        st.error("Error retrieving Wikipedia data. Please check the topic name.")
+        st.error(f"Error retrieving Wikipedia data: {e}")
 
-# (d) Semantic Search on Posts using Sentence Transformers
+# --------------------------------------------------------------------------------
+# ----------------- Semantic Search on Posts using Sentence Transformers ---------
+# --------------------------------------------------------------------------------
 st.markdown("## Semantic Search on Posts")
-search_query = st.text_input("Enter your semantic search query:")
-if search_query and text_col in df.columns:
-    @st.cache_data
-    def get_post_embeddings(texts):
-        # Use a smaller, faster model
-        model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
-        return model.encode(texts, convert_to_tensor=True)
-
-    posts = df[text_col].dropna().tolist()
-    embeddings = get_post_embeddings(posts)
-    model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
-    query_embedding = model.encode(search_query, convert_to_tensor=True)
-    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
-    top_results = cos_scores.topk(5)
-
-    st.markdown("**Top Matching Posts:**")
-    for score, idx in zip(top_results.values, top_results.indices):
-        st.write(f"Score: {score.item():.3f}")
-        st.write(posts[idx])
-        st.write("---")
-
-# (e) AI-Generated Summary of Posts
+if text_col and text_col in df.columns:
+    search_query = st.text_input("Enter your semantic search query:")
+    if search_query:
+        @st.cache_data
+        def get_post_embeddings(texts):
+            # Use a smaller, faster model
+            model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+            return model.encode(texts, convert_to_tensor=True)
+
+        posts = df[text_col].dropna().tolist()
+
+        if posts:
+            embeddings = get_post_embeddings(posts)
+            model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+            query_embedding = model.encode(search_query, convert_to_tensor=True)
+
+            cos_scores = util.cos_sim(query_embedding, embeddings)[0]
+            top_results = cos_scores.topk(5)
+
+            st.markdown("**Top Matching Posts:**")
+            for score, idx in zip(top_results.values, top_results.indices):
+                st.write(f"Score: {score.item():.3f}")
+                st.write(posts[idx])
+                st.write("---")
+        else:
+            st.info("No text data available for semantic search.")
+else:
+    st.info("No text column available to perform semantic search.")
+
+# --------------------------------------------------------------------------------
+# ------------------------ AI-Generated Summary of Posts -------------------------
+# --------------------------------------------------------------------------------
 st.markdown("## AI-Generated Summary of Posts")
 if text_col in df.columns:
-    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+    # Use the same FLAN-T5 base model or DistilBart for summarization
+    summarizer = pipeline("text2text-generation", model="google/flan-t5-base")
 
     def generate_summary(text, summarizer, max_chunk_length=1000):
+        """
+        Break text into chunks of up to max_chunk_length,
+        and pass them through the summarizer in sequence,
+        then do a final summarization pass on the combined summary.
+        """
+        sentences = text.split('. ')
         chunks, current_chunk = [], ""
-        for sentence in text.split('. '):
+
+        for sentence in sentences:
            sentence = sentence.strip() + ". "
            if len(current_chunk) + len(sentence) <= max_chunk_length:
                current_chunk += sentence
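Worth noting for the semantic-search and summarization code above: `SentenceTransformer(...)` is constructed both inside `get_post_embeddings` and again for the query, and `pipeline(...)` is rebuilt on every Streamlit rerun, so the models are reloaded each time the script executes. A sketch of the usual caching pattern, assuming a Streamlit version with `st.cache_resource`; the helper names are illustrative:

```python
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import pipeline

@st.cache_resource
def get_embedding_model():
    # Loaded once per process and reused across reruns.
    return SentenceTransformer("sentence-transformers/all-distilroberta-v1")

@st.cache_resource
def get_summarizer():
    return pipeline("text2text-generation", model="google/flan-t5-base")
```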
@@ -293,21 +406,23 @@ if text_col in df.columns:
         if current_chunk:
             chunks.append(current_chunk.strip())
 
-        summaries = []
+        # Summarize each chunk
+        interim_summaries = []
         for chunk in chunks:
             if len(chunk) > 50:
-                summary_chunk = summarizer(
-                    chunk, max_length=150, min_length=40, do_sample=False
-                )[0]['summary_text']
-                summaries.append(summary_chunk)
-        combined_summary = " ".join(summaries)
-        final_summary = summarizer(
-            combined_summary, max_length=150, min_length=40, do_sample=False
-        )[0]['summary_text']
+                prompt = f"Summarize this text: {chunk}"
+                summary_chunk = summarizer(prompt, max_length=150, do_sample=False)[0]['generated_text']
+                interim_summaries.append(summary_chunk)
+
+        # Summarize the combined interim summary
+        combined_summary = " ".join(interim_summaries)
+        final_prompt = f"Summarize this overall text: {combined_summary}"
+        final_summary = summarizer(final_prompt, max_length=150, do_sample=False)[0]['generated_text']
         return final_summary
 
+    # Take a sample of up to 10 random posts
     sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
-    if sample_text:
+    if sample_text.strip():
         final_summary = generate_summary(sample_text, summarizer, max_chunk_length=1000)
         st.write(final_summary)
     else:
@@ -320,6 +435,15 @@ else:
 # --------------------------------------------------------------------------------
 st.markdown("### End of Dashboard")
 st.markdown("""
-This dashboard is a prototype implementation for analyzing Reddit social media data.
-It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality **using faster models**.
+This dashboard is a prototype for analyzing Reddit social media data.
+It demonstrates:
+- Trend analysis with a 7-day moving average
+- A user-to-community network diagram
+- Top contributors and hashtags
+- Sentiment analysis
+- Topic embeddings with LDA + t-SNE
+- **GenAI time series summary** (FLAN-T5)
+- **Offline Wikipedia events** integration
+- **Semantic search** with Sentence Transformers
+- **Full AI-generated summary** of posts
 """)