kushalpatel0265 committed
Commit def6165 · verified
1 Parent(s): 17c9f49

Update app.py

Files changed (1)
  1. app.py +72 -196
app.py CHANGED
@@ -33,7 +33,6 @@ def load_raw_data(filepath):
33
  DATA_PATH = "data.jsonl"
34
  if not os.path.exists(DATA_PATH):
35
  st.error("data.jsonl file not found. Please ensure it is in the same directory as this app.")
36
- st.stop()
37
  else:
38
  raw_df = load_raw_data(DATA_PATH)
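load_raw_data itself sits above this hunk and is not shown; for a .jsonl file it is presumably a cached reader along these lines (an assumption, using the pd/st imports already present in app.py):

```python
@st.cache_data
def load_raw_data(filepath):
    # data.jsonl is newline-delimited JSON: one post per line.
    return pd.read_json(filepath, lines=True)
```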
39
 
@@ -68,7 +67,7 @@ elif "title" in df.columns:
68
  else:
69
  text_col = None
70
 
71
- # For hashtags: if not provided, extract them from text using regex.
72
  if "hashtags" not in df.columns:
73
  def extract_hashtags(row):
74
  text = ""
@@ -106,7 +105,6 @@ if timestamp_col in df.columns:
106
  end_date = st.sidebar.date_input("End date", max_date, min_value=min_date, max_value=max_date)
107
  if start_date > end_date:
108
  st.sidebar.error("Error: End date must fall after start date.")
109
- # Filter df between selected dates
110
  df = df[(df[timestamp_col].dt.date >= start_date) & (df[timestamp_col].dt.date <= end_date)]
111
  except Exception as e:
112
  st.sidebar.error("Error processing the timestamp column for filtering.")
@@ -144,90 +142,15 @@ if timestamp_col in df.columns:
144
  df["date"] = df[timestamp_col].dt.date
145
  time_series = df.groupby("date").size().reset_index(name="count")
146
  time_series["7-day Moving Avg"] = time_series["count"].rolling(window=7).mean()
147
- fig_time = px.line(
148
- time_series, x="date", y=["count", "7-day Moving Avg"],
149
- labels={"date": "Date", "value": "Number of Posts"},
150
- title="Posts Over Time with 7-day Moving Average"
151
- )
152
  st.plotly_chart(fig_time)
153
  else:
154
  st.info("No timestamp data available for time series plot.")
155
 
156
- # --------------------------------------------------------------------------------
157
- # --------------------------- Network Diagram (Above Pie) -------------------------
158
- # --------------------------------------------------------------------------------
159
- """
160
- We'll create a user <-> community network from the top users and top subreddits.
161
- For simplicity, we only include each user/subreddit once to avoid extremely large networks.
162
- """
163
- st.markdown("### Network Diagram")
164
-
165
  community_col = "subreddit" if "subreddit" in df.columns else user_col
166
-
167
- # Build a small network of user->community edges
168
- if community_col in df.columns and user_col in df.columns:
169
- # Let's focus on top communities
170
- top_communities_df = df[community_col].value_counts().nlargest(5) # top 5 subreddits or communities
171
- top_communities = set(top_communities_df.index)
172
-
173
- # For each row, if subreddit in top_communities, link author->subreddit
174
- # For performance, take a sample of the entire dataset or filter only relevant rows.
175
- sub_df = df[df[community_col].isin(top_communities)].copy()
176
- sub_df = sub_df.dropna(subset=[user_col, community_col])
177
- sub_df = sub_df.sample(min(500, len(sub_df)), random_state=42) # sample to reduce network size
178
-
179
- net = Network(height="600px", width="100%", notebook=False, bgcolor="#ffffff", font_color="black")
180
-
181
- # We'll track which nodes we've added to avoid duplicates
182
- added_users = set()
183
- added_comms = set()
184
-
185
- for _, row in sub_df.iterrows():
186
- user = str(row[user_col])
187
- comm = str(row[community_col])
188
-
189
- if user not in added_users:
190
- net.add_node(user, label=user, color="#FFAAAA") # user node
191
- added_users.add(user)
192
-
193
- if comm not in added_comms:
194
- net.add_node(comm, label=comm, color="#AAAACC") # community node
195
- added_comms.add(comm)
196
-
197
- net.add_edge(user, comm)
198
-
199
- net.set_options("""
200
- var options = {
201
- "nodes": {
202
- "scaling": {
203
- "min": 10,
204
- "max": 30
205
- }
206
- },
207
- "edges": {
208
- "smooth": {
209
- "type": "continuous"
210
- }
211
- },
212
- "physics": {
213
- "barnesHut": {
214
- "gravitationalConstant": -8000,
215
- "springLength": 250
216
- }
217
- }
218
- }
219
- """)
220
- # Generate network HTML
221
- net.save_graph("network.html")
222
- html_file = open("network.html", "r", encoding="utf-8")
223
- components.html(html_file.read(), height=620)
224
- html_file.close()
225
- else:
226
- st.info("Cannot build a network diagram without both user and community/subreddit columns.")
227
-
228
- # --------------------------------------------------------------------------------
229
- # --------------------------- Pie Chart of Top Contributors -----------------------
230
- # --------------------------------------------------------------------------------
231
  if community_col in df.columns:
232
  st.markdown("### Top Communities/Accounts Contributions")
233
  contributions = df[community_col].value_counts().reset_index()
@@ -239,9 +162,6 @@ if community_col in df.columns:
239
  else:
240
  st.info("No community or account data available for contributor pie chart.")
241
 
242
- # --------------------------------------------------------------------------------
243
- # ---------------------- Top Hashtags & Sentiment Analysis -----------------------
244
- # --------------------------------------------------------------------------------
245
  # Top Hashtags Bar Chart
246
  if hashtags_col in df.columns:
247
  st.markdown("### Top Hashtags")
@@ -250,11 +170,9 @@ if hashtags_col in df.columns:
250
  top_hashtags = hashtags_exploded[hashtags_col].value_counts().reset_index()
251
  top_hashtags.columns = ['hashtag', 'count']
252
  if not top_hashtags.empty:
253
- fig_hashtags = px.bar(
254
- top_hashtags.head(10), x='hashtag', y='count',
255
- labels={'hashtag': 'Hashtag', 'count': 'Frequency'},
256
- title="Top 10 Hashtags"
257
- )
258
  st.plotly_chart(fig_hashtags)
259
  else:
260
  st.info("No hashtag data available.")
@@ -265,19 +183,19 @@ else:
265
  if text_col is not None and text_col in df.columns:
266
  st.markdown("### Sentiment Analysis")
267
  df['sentiment'] = df[text_col].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0)
268
- fig_sentiment = px.histogram(
269
- df, x='sentiment', nbins=30,
270
- labels={'sentiment': 'Sentiment Polarity'},
271
- title="Sentiment Polarity Distribution"
272
- )
273
  st.plotly_chart(fig_sentiment)
274
  else:
275
  st.info(f"No '{text_col}' column available for sentiment analysis.")
276
 
277
  # --------------------------------------------------------------------------------
278
- # ------------------------- Topic Embedding Visualization -------------------------
279
  # --------------------------------------------------------------------------------
280
- st.markdown("## Topic Embedding Visualization (LDA + TSNE)")
 
 
281
  if text_col in df.columns:
282
  texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
283
  vectorizer = CountVectorizer(stop_words='english', max_features=1000)
@@ -285,57 +203,46 @@ if text_col in df.columns:
285
  lda = LatentDirichletAllocation(n_components=5, random_state=42)
286
  topic_matrix = lda.fit_transform(X)
287
  dominant_topic = topic_matrix.argmax(axis=1)
288
-
289
  tsne_model = TSNE(n_components=2, random_state=42)
290
  tsne_values = tsne_model.fit_transform(topic_matrix)
291
  tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
292
  tsne_df["Dominant Topic"] = dominant_topic.astype(str)
293
-
294
- fig_topics = px.scatter(
295
- tsne_df, x="x", y="y", color="Dominant Topic",
296
- title="TSNE Embedding of Topics"
297
- )
298
  st.plotly_chart(fig_topics)
299
  else:
300
  st.info("No text data available for topic embedding.")
301
 
302
- # --------------------------------------------------------------------------------
303
- # ----------------------- GenAI Summary for Time Series Plot ---------------------
304
- # --------------------------------------------------------------------------------
305
  st.markdown("## GenAI Summary for Time Series")
306
  if timestamp_col in df.columns:
307
- time_df = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
308
- if not time_df.empty:
309
- start = time_df[timestamp_col].min()
310
- end = time_df[timestamp_col].max()
311
- avg_posts = time_df["count"].mean()
312
- peak = time_df.loc[time_df["count"].idxmax()]
313
- description = (
314
- f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
315
- f"The highest activity was on {peak[timestamp_col]} with {peak['count']} posts."
316
- )
317
-
318
  st.write("Time Series Description:")
319
  st.write(description)
320
 
321
- # Use a smaller, faster FLAN-T5 model
322
- ts_summarizer = pipeline("text2text-generation", model="google/flan-t5-base")
323
  try:
324
- # We'll prompt it in a summarization style for clarity
325
- prompt = f"Summarize this data description: {description}"
326
- ts_summary = ts_summarizer(prompt, max_length=80, do_sample=False)[0]['generated_text']
327
  st.markdown("**GenAI Summary:**")
328
  st.write(ts_summary)
329
  except Exception as e:
330
- st.error(f"Error generating time series summary: {e}")
331
  else:
332
- st.info("No data available for time series summarization.")
333
  else:
334
  st.info("No timestamp column available for time series summary.")
335
 
336
- # --------------------------------------------------------------------------------
337
- # ----------------------- Offline Events from Wikipedia --------------------------
338
- # --------------------------------------------------------------------------------
339
  st.markdown("## Offline Events from Wikipedia")
340
  wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
341
  if wiki_topic:
@@ -344,59 +251,39 @@ if wiki_topic:
344
  st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
345
  st.write(wiki_summary)
346
  except Exception as e:
347
- st.error(f"Error retrieving Wikipedia data: {e}")
348
 
349
- # --------------------------------------------------------------------------------
350
- # ----------------- Semantic Search on Posts using Sentence Transformers ---------
351
- # --------------------------------------------------------------------------------
352
  st.markdown("## Semantic Search on Posts")
353
- if text_col and text_col in df.columns:
354
- search_query = st.text_input("Enter your semantic search query:")
355
- if search_query:
356
- @st.cache_data
357
- def get_post_embeddings(texts):
358
- # Use a smaller, faster model
359
- model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
360
- return model.encode(texts, convert_to_tensor=True)
361
-
362
- posts = df[text_col].dropna().tolist()
363
-
364
- if posts:
365
- embeddings = get_post_embeddings(posts)
366
- model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
367
- query_embedding = model.encode(search_query, convert_to_tensor=True)
368
-
369
- cos_scores = util.cos_sim(query_embedding, embeddings)[0]
370
- top_results = cos_scores.topk(5)
371
-
372
- st.markdown("**Top Matching Posts:**")
373
- for score, idx in zip(top_results.values, top_results.indices):
374
- st.write(f"Score: {score.item():.3f}")
375
- st.write(posts[idx])
376
- st.write("---")
377
- else:
378
- st.info("No text data available for semantic search.")
379
- else:
380
- st.info("No text column available to perform semantic search.")
381
-
382
- # --------------------------------------------------------------------------------
383
- # ------------------------ AI-Generated Summary of Posts -------------------------
384
- # --------------------------------------------------------------------------------
385
  st.markdown("## AI-Generated Summary of Posts")
386
  if text_col in df.columns:
387
- # Use the same FLAN-T5 base model or DistilBart for summarization
388
- summarizer = pipeline("text2text-generation", model="google/flan-t5-base")
389
 
390
  def generate_summary(text, summarizer, max_chunk_length=1000):
391
- """
392
- Break text into chunks of up to max_chunk_length,
393
- and pass them through the summarizer in sequence,
394
- then do a final summarization pass on the combined summary.
395
- """
396
- sentences = text.split('. ')
397
  chunks, current_chunk = [], ""
398
-
399
- for sentence in sentences:
400
  sentence = sentence.strip() + ". "
401
  if len(current_chunk) + len(sentence) <= max_chunk_length:
402
  current_chunk += sentence
@@ -406,23 +293,21 @@ if text_col in df.columns:
406
  if current_chunk:
407
  chunks.append(current_chunk.strip())
408
 
409
- # Summarize each chunk
410
- interim_summaries = []
411
  for chunk in chunks:
412
  if len(chunk) > 50:
413
- prompt = f"Summarize this text: {chunk}"
414
- summary_chunk = summarizer(prompt, max_length=150, do_sample=False)[0]['generated_text']
415
- interim_summaries.append(summary_chunk)
416
-
417
- # Summarize the combined interim summary
418
- combined_summary = " ".join(interim_summaries)
419
- final_prompt = f"Summarize this overall text: {combined_summary}"
420
- final_summary = summarizer(final_prompt, max_length=150, do_sample=False)[0]['generated_text']
421
  return final_summary
422
 
423
- # Take a sample of up to 10 random posts
424
  sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
425
- if sample_text.strip():
426
  final_summary = generate_summary(sample_text, summarizer, max_chunk_length=1000)
427
  st.write(final_summary)
428
  else:
@@ -435,15 +320,6 @@ else:
435
  # --------------------------------------------------------------------------------
436
  st.markdown("### End of Dashboard")
437
  st.markdown("""
438
- This dashboard is a prototype for analyzing Reddit social media data.
439
- It demonstrates:
440
- - Trend analysis with a 7-day moving average
441
- - A user-to-community network diagram
442
- - Top contributors and hashtags
443
- - Sentiment analysis
444
- - Topic embeddings with LDA + t-SNE
445
- - **GenAI time series summary** (FLAN-T5)
446
- - **Offline Wikipedia events** integration
447
- - **Semantic search** with Sentence Transformers
448
- - **Full AI-generated summary** of posts
449
  """)
 
app.py (after change)

33
  DATA_PATH = "data.jsonl"
34
  if not os.path.exists(DATA_PATH):
35
  st.error("data.jsonl file not found. Please ensure it is in the same directory as this app.")
 
36
  else:
37
  raw_df = load_raw_data(DATA_PATH)
38
 
 
67
  else:
68
  text_col = None
69
 
70
+ # For hashtags: if not provided, extract from text using regex.
71
  if "hashtags" not in df.columns:
72
  def extract_hashtags(row):
73
  text = ""
 
105
  end_date = st.sidebar.date_input("End date", max_date, min_value=min_date, max_value=max_date)
106
  if start_date > end_date:
107
  st.sidebar.error("Error: End date must fall after start date.")
 
108
  df = df[(df[timestamp_col].dt.date >= start_date) & (df[timestamp_col].dt.date <= end_date)]
109
  except Exception as e:
110
  st.sidebar.error("Error processing the timestamp column for filtering.")
 
142
  df["date"] = df[timestamp_col].dt.date
143
  time_series = df.groupby("date").size().reset_index(name="count")
144
  time_series["7-day Moving Avg"] = time_series["count"].rolling(window=7).mean()
145
+ fig_time = px.line(time_series, x="date", y=["count", "7-day Moving Avg"],
146
+ labels={"date": "Date", "value": "Number of Posts"},
147
+ title="Posts Over Time with 7-day Moving Average")
 
 
148
  st.plotly_chart(fig_time)
149
  else:
150
  st.info("No timestamp data available for time series plot.")
151
 
152
+ # Pie Chart of Top Contributors (using subreddit if available, otherwise author)
153
  community_col = "subreddit" if "subreddit" in df.columns else user_col
154
  if community_col in df.columns:
155
  st.markdown("### Top Communities/Accounts Contributions")
156
  contributions = df[community_col].value_counts().reset_index()
 
162
  else:
163
  st.info("No community or account data available for contributor pie chart.")
164
165
  # Top Hashtags Bar Chart
166
  if hashtags_col in df.columns:
167
  st.markdown("### Top Hashtags")
 
170
  top_hashtags = hashtags_exploded[hashtags_col].value_counts().reset_index()
171
  top_hashtags.columns = ['hashtag', 'count']
172
  if not top_hashtags.empty:
173
+ fig_hashtags = px.bar(top_hashtags.head(10), x='hashtag', y='count',
174
+ labels={'hashtag': 'Hashtag', 'count': 'Frequency'},
175
+ title="Top 10 Hashtags")
 
 
176
  st.plotly_chart(fig_hashtags)
177
  else:
178
  st.info("No hashtag data available.")
 
183
  if text_col is not None and text_col in df.columns:
184
  st.markdown("### Sentiment Analysis")
185
  df['sentiment'] = df[text_col].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0)
186
+ fig_sentiment = px.histogram(df, x='sentiment', nbins=30,
187
+ labels={'sentiment': 'Sentiment Polarity'},
188
+ title="Sentiment Polarity Distribution")
 
 
189
  st.plotly_chart(fig_sentiment)
190
  else:
191
  st.info(f"No '{text_col}' column available for sentiment analysis.")
192
 
193
  # --------------------------------------------------------------------------------
194
+ # ------------------------------ Additional Features -----------------------------
195
  # --------------------------------------------------------------------------------
196
+
197
+ # (a) Topic Embedding Visualization using LDA + TSNE
198
+ st.markdown("## Topic Embedding Visualization")
199
  if text_col in df.columns:
200
  texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
201
  vectorizer = CountVectorizer(stop_words='english', max_features=1000)
 
203
  lda = LatentDirichletAllocation(n_components=5, random_state=42)
204
  topic_matrix = lda.fit_transform(X)
205
  dominant_topic = topic_matrix.argmax(axis=1)
 
206
  tsne_model = TSNE(n_components=2, random_state=42)
207
  tsne_values = tsne_model.fit_transform(topic_matrix)
208
  tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
209
  tsne_df["Dominant Topic"] = dominant_topic.astype(str)
210
+ fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
211
+ title="TSNE Embedding of Topics")
 
 
 
212
  st.plotly_chart(fig_topics)
213
  else:
214
  st.info("No text data available for topic embedding.")
215
 
216
+ # (b) GenAI Summary for Time Series Plot
 
 
217
  st.markdown("## GenAI Summary for Time Series")
218
  if timestamp_col in df.columns:
219
+ time_series = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
220
+ if not time_series.empty:
221
+ start = time_series["created_utc"].min()
222
+ end = time_series["created_utc"].max()
223
+ avg_posts = time_series["count"].mean()
224
+ peak = time_series.loc[time_series["count"].idxmax()]
225
+ description = (f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
226
+ f"The highest activity was on {peak['created_utc']} with {peak['count']} posts.")
 
 
 
227
  st.write("Time Series Description:")
228
  st.write(description)
229
 
230
+ # Use a smaller, faster summarization model
231
+ ts_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
232
  try:
233
+ ts_summary = ts_summarizer(
234
+ description, max_length=80, min_length=40, do_sample=False
235
+ )[0]['summary_text']
236
  st.markdown("**GenAI Summary:**")
237
  st.write(ts_summary)
238
  except Exception as e:
239
+ st.error("Error generating time series summary.")
240
  else:
241
+ st.info("Time series data not available for summarization.")
242
  else:
243
  st.info("No timestamp column available for time series summary.")
244
 
245
+ # (c) Offline Events from Wikipedia
 
 
246
  st.markdown("## Offline Events from Wikipedia")
247
  wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
248
  if wiki_topic:
 
251
  st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
252
  st.write(wiki_summary)
253
  except Exception as e:
254
+ st.error("Error retrieving Wikipedia data. Please check the topic name.")
255
 
256
+ # (d) Semantic Search on Posts using Sentence Transformers
 
 
257
  st.markdown("## Semantic Search on Posts")
258
+ search_query = st.text_input("Enter your semantic search query:")
259
+ if search_query and text_col in df.columns:
260
+ @st.cache_data
261
+ def get_post_embeddings(texts):
262
+ # Use a smaller, faster model
263
+ model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
264
+ return model.encode(texts, convert_to_tensor=True)
265
+
266
+ posts = df[text_col].dropna().tolist()
267
+ embeddings = get_post_embeddings(posts)
268
+ model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
269
+ query_embedding = model.encode(search_query, convert_to_tensor=True)
270
+ cos_scores = util.cos_sim(query_embedding, embeddings)[0]
271
+ top_results = cos_scores.topk(5)
272
+
273
+ st.markdown("**Top Matching Posts:**")
274
+ for score, idx in zip(top_results.values, top_results.indices):
275
+ st.write(f"Score: {score.item():.3f}")
276
+ st.write(posts[idx])
277
+ st.write("---")
278
+
279
+ # (e) AI-Generated Summary of Posts
280
  st.markdown("## AI-Generated Summary of Posts")
281
  if text_col in df.columns:
282
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 
283
 
284
  def generate_summary(text, summarizer, max_chunk_length=1000):
285
  chunks, current_chunk = [], ""
286
+ for sentence in text.split('. '):
 
287
  sentence = sentence.strip() + ". "
288
  if len(current_chunk) + len(sentence) <= max_chunk_length:
289
  current_chunk += sentence
 
293
  if current_chunk:
294
  chunks.append(current_chunk.strip())
295
 
296
+ summaries = []
 
297
  for chunk in chunks:
298
  if len(chunk) > 50:
299
+ summary_chunk = summarizer(
300
+ chunk, max_length=150, min_length=40, do_sample=False
301
+ )[0]['summary_text']
302
+ summaries.append(summary_chunk)
303
+ combined_summary = " ".join(summaries)
304
+ final_summary = summarizer(
305
+ combined_summary, max_length=150, min_length=40, do_sample=False
306
+ )[0]['summary_text']
307
  return final_summary
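If every chunk is 50 characters or shorter, summaries stays empty and the final summarizer call receives an empty string. A small fallback inside generate_summary (a sketch) covers that case:

```python
combined_summary = " ".join(summaries)
if not combined_summary.strip():
    # Nothing long enough to summarize; return the original sample instead of calling the model.
    return text
final_summary = summarizer(
    combined_summary, max_length=150, min_length=40, do_sample=False
)[0]['summary_text']
return final_summary
```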
308
 
 
309
  sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
310
+ if sample_text:
311
  final_summary = generate_summary(sample_text, summarizer, max_chunk_length=1000)
312
  st.write(final_summary)
313
  else:
 
320
  # --------------------------------------------------------------------------------
321
  st.markdown("### End of Dashboard")
322
  st.markdown("""
323
+ This dashboard is a prototype implementation for analyzing Reddit social media data.
324
+ It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality **using faster models**.
325
  """)