kushalpatel0265 committed on
Commit 5f7bd44 · verified
Parent: 6aea2c4

Create app.py

Files changed (1)
  1. app.py +327 -0
app.py ADDED
@@ -0,0 +1,327 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ import os
+ import re
+ from datetime import datetime
+ from textblob import TextBlob
+ import networkx as nx
+ from pyvis.network import Network
+ import streamlit.components.v1 as components
+ # Note: datetime, networkx, pyvis, and components are imported but not used in this version.
+
+ # Transformers & Semantic Search
+ from transformers import pipeline
+ from sentence_transformers import SentenceTransformer, util
+ import wikipedia  # For offline events summary
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation
+ from sklearn.manifold import TSNE
+
+ # --------------------------------------------------------------------------------
+ # ----------------------- Data Loading and Normalization -------------------------
+ # --------------------------------------------------------------------------------
+ @st.cache_data
+ def load_raw_data(filepath):
+     """Load the newline-delimited JSON file into a Pandas DataFrame."""
+     try:
+         raw_df = pd.read_json(filepath, lines=True)
+     except ValueError as e:
+         st.error("Error reading the JSONL file. Please check the file format.")
+         raise e
+     return raw_df
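+ # Illustrative only (not part of the committed data): each line of data.jsonl is
+ # assumed to be one standalone JSON object, e.g. a Reddit post wrapped in a nested
+ # "data" record such as:
+ # {"data": {"author": "user1", "created_utc": 1646092800, "title": "...", "selftext": "...", "subreddit": "..."}}
+ # pd.read_json(..., lines=True) parses one such object per line.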
+
+ DATA_PATH = "data.jsonl"
+ if not os.path.exists(DATA_PATH):
+     st.error("data.jsonl file not found. Please ensure it is in the same directory as this app.")
+     st.stop()  # Halt here; everything below assumes raw_df exists (otherwise a NameError follows).
+ raw_df = load_raw_data(DATA_PATH)
+
+ st.sidebar.markdown("### Raw Dataset Columns")
+ st.sidebar.write(raw_df.columns.tolist())
+
+ # Normalize the nested "data" column if present
+ if 'data' in raw_df.columns:
+     try:
+         df = pd.json_normalize(raw_df['data'])
+     except Exception:
+         st.error("Error normalizing the 'data' column.")
+         df = raw_df
+ else:
+     df = raw_df
+
+ st.sidebar.markdown("### Normalized Data Columns")
+ st.sidebar.write(df.columns.tolist())
+
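+ # Illustrative only: pd.json_normalize flattens nested dicts into dotted columns,
+ # e.g. pd.json_normalize([{"a": {"b": 1}}]) yields a DataFrame with a single column "a.b".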
+ # --------------------------------------------------------------------------------
+ # ------------------------- Column Mapping (Reddit Data) --------------------------
+ # --------------------------------------------------------------------------------
+ # Typical Reddit fields:
+ timestamp_col = "created_utc"  # Unix timestamp (in seconds)
+ user_col = "author"            # Author
+
+ # For text, prefer "selftext" if available; otherwise, use "title".
+ if "selftext" in df.columns and df["selftext"].notnull().sum() > 0:
+     text_col = "selftext"
+ elif "title" in df.columns:
+     text_col = "title"
+ else:
+     text_col = None
+
+ # For hashtags: if not provided, extract them from the text using a regex.
+ if "hashtags" not in df.columns:
+     def extract_hashtags(row):
+         text = ""
+         if "title" in row and pd.notnull(row["title"]):
+             text += row["title"] + " "
+         if "selftext" in row and pd.notnull(row["selftext"]):
+             text += row["selftext"]
+         return re.findall(r"#\w+", text)
+     df["hashtags"] = df.apply(extract_hashtags, axis=1)
+ hashtags_col = "hashtags"
+
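+ # Illustrative only: re.findall(r"#\w+", "Breaking #news on #AI today") returns
+ # ['#news', '#AI'], so rows with no hashtags in their text get an empty list.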
+ # Convert the Unix timestamp to datetime if available
+ if timestamp_col in df.columns:
+     try:
+         df[timestamp_col] = pd.to_datetime(df[timestamp_col], unit='s')
+     except Exception:
+         st.error(f"Error converting timestamp. Check the format of '{timestamp_col}'.")
+
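+ # Illustrative only: pd.to_datetime(1646092800, unit='s') -> Timestamp('2022-03-01 00:00:00'),
+ # i.e. seconds since the Unix epoch become naive UTC datetimes.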
+ # --------------------------------------------------------------------------------
+ # --------------------------- Sidebar: Filters & Platform -------------------------
+ # --------------------------------------------------------------------------------
+ st.sidebar.header("Filters & Platform")
+
+ # Platform selector (simulates multiple platforms)
+ platform = st.sidebar.selectbox("Select Platform", ["Reddit", "Twitter", "Facebook"])
+ if platform != "Reddit":
+     st.sidebar.info(f"Data for {platform} is not available. Showing Reddit data.")
+
+ # Date filter
+ if timestamp_col in df.columns:
+     try:
+         min_date = df[timestamp_col].min().date()
+         max_date = df[timestamp_col].max().date()
+         start_date = st.sidebar.date_input("Start date", min_date, min_value=min_date, max_value=max_date)
+         end_date = st.sidebar.date_input("End date", max_date, min_value=min_date, max_value=max_date)
+         if start_date > end_date:
+             st.sidebar.error("Error: End date must fall after start date.")
+         df = df[(df[timestamp_col].dt.date >= start_date) & (df[timestamp_col].dt.date <= end_date)]
+     except Exception:
+         st.sidebar.error("Error processing the timestamp column for filtering.")
+ else:
+     st.sidebar.info(f"No '{timestamp_col}' column found for filtering by date.")
+
+ # Keyword/Hashtag search
+ search_term = st.sidebar.text_input("Search for a keyword/hashtag:")
+ if search_term:
+     if text_col is not None and text_col in df.columns:
+         # regex=False treats the query as a literal string, so characters like "(" don't break the filter.
+         df = df[df[text_col].str.contains(search_term, case=False, na=False, regex=False)]
+     st.sidebar.markdown(f"### Showing results for '{search_term}'")
+
+ # --------------------------------------------------------------------------------
+ # ------------------------- Main Dashboard: Basic Visualizations ------------------
+ # --------------------------------------------------------------------------------
+ st.title("Social Media Data Analysis Dashboard")
+ st.markdown("""
+ This dashboard visualizes Reddit data, showcasing trends over time, key contributors, topic embeddings, and more.
+ """)
+
+ # Summary metrics
+ total_posts = len(df)
+ st.markdown("### Summary Metrics")
+ st.write("**Total Posts:**", total_posts)
+ if user_col in df.columns:
+     unique_users = df[user_col].nunique()
+     st.write("**Unique Users:**", unique_users)
+ else:
+     st.write("**Unique Users:** Data not available")
+
+ # Time series plot with a 7-day moving average
+ if timestamp_col in df.columns:
+     st.markdown("### Posts Over Time with Moving Average")
+     df["date"] = df[timestamp_col].dt.date
+     time_series = df.groupby("date").size().reset_index(name="count")
+     time_series["7-day Moving Avg"] = time_series["count"].rolling(window=7).mean()
+     fig_time = px.line(time_series, x="date", y=["count", "7-day Moving Avg"],
+                        labels={"date": "Date", "value": "Number of Posts"},
+                        title="Posts Over Time with 7-day Moving Average")
+     st.plotly_chart(fig_time)
+ else:
+     st.info("No timestamp data available for time series plot.")
+
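+ # Illustrative only: rolling(window=7).mean() averages each day with the six preceding
+ # days, so the first six rows of the moving-average column come out as NaN.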
+ # Pie chart of top contributors (subreddit if available, otherwise author)
+ community_col = "subreddit" if "subreddit" in df.columns else user_col
+ if community_col in df.columns:
+     st.markdown("### Top Communities/Accounts Contributions")
+     contributions = df[community_col].value_counts().reset_index()
+     contributions.columns = [community_col, "count"]
+     top_contributions = contributions.head(10)
+     fig_pie = px.pie(top_contributions, values="count", names=community_col,
+                      title="Top 10 Contributors")
+     st.plotly_chart(fig_pie)
+ else:
+     st.info("No community or account data available for contributor pie chart.")
+
+ # Top hashtags bar chart
+ if hashtags_col in df.columns:
+     st.markdown("### Top Hashtags")
+     hashtags_exploded = df.explode(hashtags_col)
+     # Exploding an empty list yields NaN, so drop those as well as empty strings.
+     hashtags_exploded = hashtags_exploded.dropna(subset=[hashtags_col])
+     hashtags_exploded = hashtags_exploded[hashtags_exploded[hashtags_col] != ""]
+     top_hashtags = hashtags_exploded[hashtags_col].value_counts().reset_index()
+     top_hashtags.columns = ['hashtag', 'count']
+     if not top_hashtags.empty:
+         fig_hashtags = px.bar(top_hashtags.head(10), x='hashtag', y='count',
+                               labels={'hashtag': 'Hashtag', 'count': 'Frequency'},
+                               title="Top 10 Hashtags")
+         st.plotly_chart(fig_hashtags)
+     else:
+         st.info("No hashtag data available.")
+ else:
+     st.info("No 'hashtags' column found in the dataset.")
+
+ # Sentiment analysis on the text data
+ if text_col is not None and text_col in df.columns:
+     st.markdown("### Sentiment Analysis")
+     df['sentiment'] = df[text_col].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0)
+     fig_sentiment = px.histogram(df, x='sentiment', nbins=30,
+                                  labels={'sentiment': 'Sentiment Polarity'},
+                                  title="Sentiment Polarity Distribution")
+     st.plotly_chart(fig_sentiment)
+ else:
+     st.info("No text column available for sentiment analysis.")
+
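+ # Illustrative only: TextBlob polarity lies in [-1, 1]; e.g.
+ # TextBlob("I love this").sentiment.polarity is positive (about 0.5), while
+ # TextBlob("I hate this").sentiment.polarity is negative.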
+ # --------------------------------------------------------------------------------
+ # ---------------------------- Optional Features ----------------------------------
+ # Use the sidebar checkboxes to toggle optional features.
+ # --------------------------------------------------------------------------------
+ st.sidebar.markdown("### Optional Features")
+ show_topic_embedding = st.sidebar.checkbox("Topic Embedding Visualization")
+ show_ts_genai_summary = st.sidebar.checkbox("GenAI Summary for Time Series")
+ show_offline_events = st.sidebar.checkbox("Offline Events (Wikipedia)")
+ show_semantic_search = st.sidebar.checkbox("Semantic Search on Posts")
+
+ # ---------------------------------------------------------------------
+ # (a) Topic Embedding Visualization using LDA + t-SNE
+ # ---------------------------------------------------------------------
+ if show_topic_embedding:
+     st.markdown("## Topic Embedding Visualization")
+     if text_col is not None and text_col in df.columns:
+         non_null = df[text_col].dropna()
+         # Sample at most 500 posts, bounded by the number of non-null texts.
+         texts = non_null.sample(n=min(500, len(non_null)), random_state=42).tolist()
+         vectorizer = CountVectorizer(stop_words='english', max_features=1000)
+         X = vectorizer.fit_transform(texts)
+         lda = LatentDirichletAllocation(n_components=5, random_state=42)
+         topic_matrix = lda.fit_transform(X)
+         dominant_topic = topic_matrix.argmax(axis=1)
+         # t-SNE requires perplexity < n_samples, so cap it for small samples.
+         tsne_model = TSNE(n_components=2, random_state=42,
+                           perplexity=min(30, max(2, len(texts) - 1)))
+         tsne_values = tsne_model.fit_transform(topic_matrix)
+         tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
+         tsne_df["Dominant Topic"] = dominant_topic.astype(str)
+         fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
+                                 title="t-SNE Embedding of Topics")
+         st.plotly_chart(fig_topics)
+     else:
+         st.info("No text data available for topic embedding.")
+
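+ # Illustrative only: lda.fit_transform returns one row per document holding 5 topic
+ # weights that sum to 1, so argmax(axis=1) picks each document's dominant topic and
+ # t-SNE projects the 5-D topic mixtures down to 2-D for plotting.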
+ # ---------------------------------------------------------------------
+ # (b) GenAI Summary for the Time Series Plot
+ # ---------------------------------------------------------------------
+ @st.cache_resource
+ def load_summarizer():
+     """Load the BART summarization pipeline once and reuse it across reruns."""
+     return pipeline("summarization", model="facebook/bart-large-cnn")
+
+ if show_ts_genai_summary:
+     st.markdown("## GenAI Summary for Time Series")
+     # time_series only exists when a timestamp column was present above.
+     if timestamp_col in df.columns and not time_series.empty:
+         start = time_series["date"].min()
+         end = time_series["date"].max()
+         avg_posts = time_series["count"].mean()
+         peak = time_series.loc[time_series["count"].idxmax()]
+         description = (f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
+                        f"The highest activity was on {peak['date']} with {peak['count']} posts.")
+         st.write("Time Series Description:")
+         st.write(description)
+         ts_summarizer = load_summarizer()
+         try:
+             ts_summary = ts_summarizer(description, max_length=80, min_length=40, do_sample=False)[0]['summary_text']
+             st.markdown("**GenAI Summary:**")
+             st.write(ts_summary)
+         except Exception:
+             st.error("Error generating time series summary.")
+     else:
+         st.info("Time series data not available for summarization.")
+
+ # ---------------------------------------------------------------------
+ # (d) Offline Events from Wikipedia for a Given Topic
+ # ---------------------------------------------------------------------
+ if show_offline_events:
+     st.markdown("## Offline Events from Wikipedia")
+     wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
+     if wiki_topic:
+         try:
+             wiki_summary = wikipedia.summary(wiki_topic, sentences=5)
+             st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
+             st.write(wiki_summary)
+         except Exception:
+             st.error("Error retrieving Wikipedia data. Please check the topic name.")
+
+ # ---------------------------------------------------------------------
+ # (f) Semantic Search on Posts using Sentence Transformers
+ # ---------------------------------------------------------------------
+ if show_semantic_search:
+     st.markdown("## Semantic Search on Posts")
+     search_query = st.text_input("Enter your semantic search query:")
+     if search_query and text_col is not None and text_col in df.columns:
+         @st.cache_resource
+         def load_embedder():
+             """Load the sentence-embedding model once instead of on every query."""
+             return SentenceTransformer("all-MiniLM-L6-v2")
+
+         @st.cache_data
+         def get_post_embeddings(texts):
+             return load_embedder().encode(texts, convert_to_tensor=True)
+
+         posts = df[text_col].dropna().tolist()
+         embeddings = get_post_embeddings(posts)
+         query_embedding = load_embedder().encode(search_query, convert_to_tensor=True)
+         cos_scores = util.cos_sim(query_embedding, embeddings)[0]
+         top_results = cos_scores.topk(min(5, len(posts)))
+         st.markdown("**Top Matching Posts:**")
+         for score, idx in zip(top_results.values, top_results.indices):
+             st.write(f"Score: {score.item():.3f}")
+             st.write(posts[int(idx)])
+             st.write("---")
+
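+ # Illustrative only: util.cos_sim(query_embedding, embeddings) returns a 1 x N tensor
+ # of cosine similarities in [-1, 1]; .topk(k) then yields the k best (score, index) pairs.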
+ # ---------------------------------------------------------------------
+ # (Optional) AI-Generated Summary of Posts (Existing Feature)
+ # ---------------------------------------------------------------------
+ st.markdown("## AI-Generated Summary of Posts")
+ if text_col is not None and text_col in df.columns:
+     summarizer = load_summarizer()
+
+     def generate_summary(text, summarizer, max_chunk_length=1000):
+         """Split the text into ~1000-character chunks, summarize each, then summarize the summaries."""
+         chunks, current_chunk = [], ""
+         for sentence in text.split('. '):
+             sentence = sentence.strip() + ". "
+             if len(current_chunk) + len(sentence) <= max_chunk_length:
+                 current_chunk += sentence
+             else:
+                 chunks.append(current_chunk.strip())
+                 current_chunk = sentence
+         if current_chunk:
+             chunks.append(current_chunk.strip())
+         summaries = []
+         for chunk in chunks:
+             if len(chunk) > 50:
+                 summary_chunk = summarizer(chunk, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+                 summaries.append(summary_chunk)
+         if not summaries:
+             return ""  # Nothing long enough to summarize.
+         combined_summary = " ".join(summaries)
+         final_summary = summarizer(combined_summary, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+         return final_summary
+
+     non_null = df[text_col].dropna()
+     sample_text = " ".join(non_null.sample(n=min(10, len(non_null)), random_state=42).tolist())
+     if sample_text:
+         final_summary = generate_summary(sample_text, summarizer, max_chunk_length=1000)
+         st.write(final_summary if final_summary else "Not enough text data available for summarization.")
+     else:
+         st.info("Not enough text data available for summarization.")
+ else:
+     st.info("No text data available for AI summarization.")
+
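+ # Illustrative only: this chunk-then-recombine loop is a simple map-reduce pattern for
+ # long inputs, since BART's input window is limited (~1024 tokens): summarize each
+ # chunk, then summarize the concatenated chunk summaries.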
+ # --------------------------------------------------------------------------------
+ # ------------------------------- End of Dashboard --------------------------------
+ # --------------------------------------------------------------------------------
+ st.markdown("### End of Dashboard")
+ st.markdown("""
+ This dashboard is a prototype for analyzing Reddit social media data.
+ It demonstrates trend analysis with moving averages, contributor insights, topic embeddings, GenAI summaries, offline event linking (Wikipedia), and semantic search.
+ """)