Update app.py
app.py
CHANGED
@@ -191,103 +191,96 @@ else:
     st.info(f"No '{text_col}' column available for sentiment analysis.")
 
 # --------------------------------------------------------------------------------
-#
-# Use sidebar checkboxes to toggle optional features
+# ------------------------------ Additional Features -----------------------------
 # --------------------------------------------------------------------------------
-st.sidebar.markdown("### Optional Features")
-show_topic_embedding = st.sidebar.checkbox("Topic Embedding Visualization")
-show_ts_genai_summary = st.sidebar.checkbox("GenAI Summary for Time Series")
-show_offline_events = st.sidebar.checkbox("Offline Events (Wikipedia)")
-show_semantic_search = st.sidebar.checkbox("Semantic Search on Posts")
 
-# ---------------------------------------------------------------------
 # (a) Topic Embedding Visualization using LDA + TSNE
-
-if show_topic_embedding:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-else:
-    st.info("No text data available for topic embedding.")
+st.markdown("## Topic Embedding Visualization")
+if text_col in df.columns:
+    texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
+    vectorizer = CountVectorizer(stop_words='english', max_features=1000)
+    X = vectorizer.fit_transform(texts)
+    lda = LatentDirichletAllocation(n_components=5, random_state=42)
+    topic_matrix = lda.fit_transform(X)
+    dominant_topic = topic_matrix.argmax(axis=1)
+    tsne_model = TSNE(n_components=2, random_state=42)
+    tsne_values = tsne_model.fit_transform(topic_matrix)
+    tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
+    tsne_df["Dominant Topic"] = dominant_topic.astype(str)
+    fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
+                            title="TSNE Embedding of Topics")
+    st.plotly_chart(fig_topics)
+else:
+    st.info("No text data available for topic embedding.")
 
-# ---------------------------------------------------------------------
 # (b) GenAI Summary for Time Series Plot
-
-if show_ts_genai_summary:
-
+st.markdown("## GenAI Summary for Time Series")
+if timestamp_col in df.columns:
+    time_series = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
     if not time_series.empty:
-        start = time_series["
-        end = time_series["
+        start = time_series["created_utc"].min()
+        end = time_series["created_utc"].max()
         avg_posts = time_series["count"].mean()
         peak = time_series.loc[time_series["count"].idxmax()]
         description = (f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
-                       f"The highest activity was on {peak['
+                       f"The highest activity was on {peak['created_utc']} with {peak['count']} posts.")
         st.write("Time Series Description:")
         st.write(description)
-
+
+        # Use a smaller, faster summarization model
+        ts_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
         try:
-            ts_summary = ts_summarizer(
+            ts_summary = ts_summarizer(
+                description, max_length=80, min_length=40, do_sample=False
+            )[0]['summary_text']
             st.markdown("**GenAI Summary:**")
             st.write(ts_summary)
         except Exception as e:
             st.error("Error generating time series summary.")
     else:
         st.info("Time series data not available for summarization.")
+else:
+    st.info("No timestamp column available for time series summary.")
 
-#
-
-
-if show_offline_events:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    st.
-
-
-
-
-
-# ---------------------------------------------------------------------
-# (Optional) AI-Generated Summary on Posts (Existing Feature)
-# ---------------------------------------------------------------------
+# (c) Offline Events from Wikipedia
+st.markdown("## Offline Events from Wikipedia")
+wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
+if wiki_topic:
+    try:
+        wiki_summary = wikipedia.summary(wiki_topic, sentences=5)
+        st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
+        st.write(wiki_summary)
+    except Exception as e:
+        st.error("Error retrieving Wikipedia data. Please check the topic name.")
+
+# (d) Semantic Search on Posts using Sentence Transformers
+st.markdown("## Semantic Search on Posts")
+search_query = st.text_input("Enter your semantic search query:")
+if search_query and text_col in df.columns:
+    @st.cache_data
+    def get_post_embeddings(texts):
+        # Use a smaller, faster model
+        model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+        return model.encode(texts, convert_to_tensor=True)
+
+    posts = df[text_col].dropna().tolist()
+    embeddings = get_post_embeddings(posts)
+    model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+    query_embedding = model.encode(search_query, convert_to_tensor=True)
+    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
+    top_results = cos_scores.topk(5)
+
+    st.markdown("**Top Matching Posts:**")
+    for score, idx in zip(top_results.values, top_results.indices):
+        st.write(f"Score: {score.item():.3f}")
+        st.write(posts[idx])
+        st.write("---")
+
+# (e) AI-Generated Summary of Posts
 st.markdown("## AI-Generated Summary of Posts")
 if text_col in df.columns:
-    summarizer = pipeline("summarization", model="
+    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+
     def generate_summary(text, summarizer, max_chunk_length=1000):
         chunks, current_chunk = [], ""
         for sentence in text.split('. '):
@@ -299,13 +292,18 @@ if text_col in df.columns:
             current_chunk = sentence
         if current_chunk:
             chunks.append(current_chunk.strip())
+
         summaries = []
         for chunk in chunks:
             if len(chunk) > 50:
-                summary_chunk = summarizer(
+                summary_chunk = summarizer(
+                    chunk, max_length=150, min_length=40, do_sample=False
+                )[0]['summary_text']
                 summaries.append(summary_chunk)
         combined_summary = " ".join(summaries)
-        final_summary = summarizer(
+        final_summary = summarizer(
+            combined_summary, max_length=150, min_length=40, do_sample=False
+        )[0]['summary_text']
         return final_summary
 
     sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
@@ -323,5 +321,5 @@ else:
 st.markdown("### End of Dashboard")
 st.markdown("""
 This dashboard is a prototype implementation for analyzing Reddit social media data.
-It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality
+It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality **using faster models**.
 """)