kushalpatel0265 committed on
Commit 6d185a4 · verified · 1 Parent(s): 936ff44

Update app.py

Files changed (1)
app.py +77 -79
app.py CHANGED
@@ -191,103 +191,96 @@ else:
     st.info(f"No '{text_col}' column available for sentiment analysis.")
 
 # --------------------------------------------------------------------------------
-# ---------------------------- Optional Features ---------------------------------
-# Use sidebar checkboxes to toggle optional features
+# ------------------------------ Additional Features -----------------------------
 # --------------------------------------------------------------------------------
-st.sidebar.markdown("### Optional Features")
-show_topic_embedding = st.sidebar.checkbox("Topic Embedding Visualization")
-show_ts_genai_summary = st.sidebar.checkbox("GenAI Summary for Time Series")
-show_offline_events = st.sidebar.checkbox("Offline Events (Wikipedia)")
-show_semantic_search = st.sidebar.checkbox("Semantic Search on Posts")
 
-# ---------------------------------------------------------------------
 # (a) Topic Embedding Visualization using LDA + TSNE
-# ---------------------------------------------------------------------
-if show_topic_embedding:
-    st.markdown("## Topic Embedding Visualization")
-    if text_col in df.columns:
-        texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
-        vectorizer = CountVectorizer(stop_words='english', max_features=1000)
-        X = vectorizer.fit_transform(texts)
-        lda = LatentDirichletAllocation(n_components=5, random_state=42)
-        topic_matrix = lda.fit_transform(X)
-        dominant_topic = topic_matrix.argmax(axis=1)
-        tsne_model = TSNE(n_components=2, random_state=42)
-        tsne_values = tsne_model.fit_transform(topic_matrix)
-        tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
-        tsne_df["Dominant Topic"] = dominant_topic.astype(str)
-        fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
-                                title="TSNE Embedding of Topics")
-        st.plotly_chart(fig_topics)
-    else:
-        st.info("No text data available for topic embedding.")
+st.markdown("## Topic Embedding Visualization")
+if text_col in df.columns:
+    texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
+    vectorizer = CountVectorizer(stop_words='english', max_features=1000)
+    X = vectorizer.fit_transform(texts)
+    lda = LatentDirichletAllocation(n_components=5, random_state=42)
+    topic_matrix = lda.fit_transform(X)
+    dominant_topic = topic_matrix.argmax(axis=1)
+    tsne_model = TSNE(n_components=2, random_state=42)
+    tsne_values = tsne_model.fit_transform(topic_matrix)
+    tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
+    tsne_df["Dominant Topic"] = dominant_topic.astype(str)
+    fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
+                            title="TSNE Embedding of Topics")
+    st.plotly_chart(fig_topics)
+else:
+    st.info("No text data available for topic embedding.")
 
-# ---------------------------------------------------------------------
 # (b) GenAI Summary for Time Series Plot
-# ---------------------------------------------------------------------
-if show_ts_genai_summary:
-    st.markdown("## GenAI Summary for Time Series")
+st.markdown("## GenAI Summary for Time Series")
+if timestamp_col in df.columns:
+    time_series = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
     if not time_series.empty:
-        start = time_series["date"].min()
-        end = time_series["date"].max()
+        start = time_series["created_utc"].min()
+        end = time_series["created_utc"].max()
         avg_posts = time_series["count"].mean()
         peak = time_series.loc[time_series["count"].idxmax()]
         description = (f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
-                       f"The highest activity was on {peak['date']} with {peak['count']} posts.")
+                       f"The highest activity was on {peak['created_utc']} with {peak['count']} posts.")
         st.write("Time Series Description:")
         st.write(description)
-        ts_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+        # Use a smaller, faster summarization model
+        ts_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
         try:
-            ts_summary = ts_summarizer(description, max_length=80, min_length=40, do_sample=False)[0]['summary_text']
+            ts_summary = ts_summarizer(
+                description, max_length=80, min_length=40, do_sample=False
+            )[0]['summary_text']
             st.markdown("**GenAI Summary:**")
             st.write(ts_summary)
         except Exception as e:
             st.error("Error generating time series summary.")
     else:
         st.info("Time series data not available for summarization.")
+else:
+    st.info("No timestamp column available for time series summary.")
 
-# ---------------------------------------------------------------------
-# (d) Offline Events from Wikipedia for a Given Topic
-# ---------------------------------------------------------------------
-if show_offline_events:
-    st.markdown("## Offline Events from Wikipedia")
-    wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
-    if wiki_topic:
-        try:
-            wiki_summary = wikipedia.summary(wiki_topic, sentences=5)
-            st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
-            st.write(wiki_summary)
-        except Exception as e:
-            st.error("Error retrieving Wikipedia data. Please check the topic name.")
-
-# ---------------------------------------------------------------------
-# (f) Semantic Search on Posts using Sentence Transformers
-# ---------------------------------------------------------------------
-if show_semantic_search:
-    st.markdown("## Semantic Search on Posts")
-    search_query = st.text_input("Enter your semantic search query:")
-    if search_query and text_col in df.columns:
-        @st.cache_data
-        def get_post_embeddings(texts):
-            model = SentenceTransformer("all-MiniLM-L6-v2")
-            return model.encode(texts, convert_to_tensor=True)
-        posts = df[text_col].dropna().tolist()
-        embeddings = get_post_embeddings(posts)
-        query_embedding = SentenceTransformer("all-MiniLM-L6-v2").encode(search_query, convert_to_tensor=True)
-        cos_scores = util.cos_sim(query_embedding, embeddings)[0]
-        top_results = cos_scores.topk(5)
-        st.markdown("**Top Matching Posts:**")
-        for score, idx in zip(top_results.values, top_results.indices):
-            st.write(f"Score: {score.item():.3f}")
-            st.write(posts[idx])
-            st.write("---")
-
-# ---------------------------------------------------------------------
-# (Optional) AI-Generated Summary on Posts (Existing Feature)
-# ---------------------------------------------------------------------
+# (c) Offline Events from Wikipedia
+st.markdown("## Offline Events from Wikipedia")
+wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
+if wiki_topic:
+    try:
+        wiki_summary = wikipedia.summary(wiki_topic, sentences=5)
+        st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
+        st.write(wiki_summary)
+    except Exception as e:
+        st.error("Error retrieving Wikipedia data. Please check the topic name.")
+
+# (d) Semantic Search on Posts using Sentence Transformers
+st.markdown("## Semantic Search on Posts")
+search_query = st.text_input("Enter your semantic search query:")
+if search_query and text_col in df.columns:
+    @st.cache_data
+    def get_post_embeddings(texts):
+        # Use a smaller, faster model
+        model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+        return model.encode(texts, convert_to_tensor=True)
+
+    posts = df[text_col].dropna().tolist()
+    embeddings = get_post_embeddings(posts)
+    model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+    query_embedding = model.encode(search_query, convert_to_tensor=True)
+    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
+    top_results = cos_scores.topk(5)
+
+    st.markdown("**Top Matching Posts:**")
+    for score, idx in zip(top_results.values, top_results.indices):
+        st.write(f"Score: {score.item():.3f}")
+        st.write(posts[idx])
+        st.write("---")
+
+# (e) AI-Generated Summary of Posts
 st.markdown("## AI-Generated Summary of Posts")
 if text_col in df.columns:
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+
     def generate_summary(text, summarizer, max_chunk_length=1000):
         chunks, current_chunk = [], ""
         for sentence in text.split('. '):
@@ -299,13 +292,18 @@ if text_col in df.columns:
             current_chunk = sentence
         if current_chunk:
             chunks.append(current_chunk.strip())
+
         summaries = []
         for chunk in chunks:
             if len(chunk) > 50:
-                summary_chunk = summarizer(chunk, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+                summary_chunk = summarizer(
+                    chunk, max_length=150, min_length=40, do_sample=False
+                )[0]['summary_text']
                 summaries.append(summary_chunk)
         combined_summary = " ".join(summaries)
-        final_summary = summarizer(combined_summary, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+        final_summary = summarizer(
+            combined_summary, max_length=150, min_length=40, do_sample=False
+        )[0]['summary_text']
         return final_summary
 
     sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
@@ -323,5 +321,5 @@ else:
 st.markdown("### End of Dashboard")
 st.markdown("""
 This dashboard is a prototype implementation for analyzing Reddit social media data.
-It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality.
+It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality **using faster models**.
 """)
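Review notes (suggestions on the new code, not part of the commit itself):

The topic-embedding block runs `TSNE` with its default perplexity of 30, but the sampled corpus can be smaller than that on small uploads, and scikit-learn raises an error unless perplexity is strictly below the sample count. A minimal guard, with `safe_tsne` as a hypothetical helper name:

```python
import numpy as np
from sklearn.manifold import TSNE

def safe_tsne(topic_matrix: np.ndarray, random_state: int = 42) -> np.ndarray:
    """Project an (n_samples, n_topics) matrix to 2-D, capping perplexity
    below n_samples so small corpora do not crash the embedding."""
    perplexity = min(30.0, max(1.0, topic_matrix.shape[0] - 1))
    return TSNE(n_components=2, perplexity=perplexity, init="random",
                random_state=random_state).fit_transform(topic_matrix)

print(safe_tsne(np.random.default_rng(0).random((10, 5))).shape)  # (10, 2)
```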
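The new time-series block hardcodes the `created_utc` column even though `timestamp_col` is already in scope; `groupby(df[timestamp_col].dt.date)` names the date column after `timestamp_col`, so indexing with the variable keeps the feature working for any column name. A sketch with a hypothetical `describe_time_series` helper:

```python
import pandas as pd

def describe_time_series(df: pd.DataFrame, timestamp_col: str) -> str:
    """Build the daily-activity description without hardcoding a column name."""
    daily = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
    start, end = daily[timestamp_col].min(), daily[timestamp_col].max()
    peak = daily.loc[daily["count"].idxmax()]
    return (f"From {start} to {end}, the average number of posts per day was "
            f"{daily['count'].mean():.1f}. The highest activity was on "
            f"{peak[timestamp_col]} with {peak['count']} posts.")

# Tiny demo with a synthetic frame:
demo = pd.DataFrame({"created_utc": pd.to_datetime(
    ["2022-02-24", "2022-02-24", "2022-02-25"])})
print(describe_time_series(demo, "created_utc"))
```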
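The Wikipedia block catches a bare `Exception`, but the `wikipedia` package raises distinct `DisambiguationError` and `PageError` exceptions that carry useful hints (an ambiguous title lists its candidate pages on the exception). A sketch, assuming the same `wikipedia` PyPI package already imported in app.py, with `fetch_event_summary` as a hypothetical helper:

```python
import wikipedia

def fetch_event_summary(topic: str, sentences: int = 5) -> str:
    """Fetch a short Wikipedia summary, surfacing useful hints on failure."""
    try:
        return wikipedia.summary(topic, sentences=sentences)
    except wikipedia.exceptions.DisambiguationError as e:
        # Ambiguous titles list their candidate pages on the exception.
        return f"'{topic}' is ambiguous; try one of: {', '.join(e.options[:5])}"
    except wikipedia.exceptions.PageError:
        return f"No Wikipedia page found for '{topic}'."

print(fetch_event_summary("Russian invasion of Ukraine"))
```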