Ippo987 committed on
Commit
fa9cb80
·
verified ·
1 Parent(s): 52897d7

Update TrendAnalysis.py

Browse files
Files changed (1) hide show
  1. TrendAnalysis.py +1044 -1044
TrendAnalysis.py CHANGED
@@ -1,1044 +1,1044 @@
1
- from motor.motor_asyncio import AsyncIOMotorClient
2
- import pandas as pd
3
- import numpy as np
4
- import re
5
- import json
6
- import umap
7
- import plotly.io as pio
8
- import hdbscan
9
- from bertopic import BERTopic
10
- from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
11
- from skopt import gp_minimize
12
- from sentence_transformers import SentenceTransformer
13
- import torch
14
- import random
15
- import multiprocessing
16
- from sklearn.feature_extraction.text import CountVectorizer
17
- from bertopic.vectorizers import ClassTfidfTransformer
18
- from bertopic.representation import KeyBERTInspired
19
- import optuna
20
- import pandas as pd
21
- import dash
22
- from dash import dcc, html, Input, Output, State
23
- import plotly.graph_objects as go
24
- import plotly.express as px
25
- import numpy as np
26
- import dash_bootstrap_components as dbc
27
- from fastapi import HTTPException, APIRouter, Request
28
- from pydantic import BaseModel
29
- import threading
30
- import time
31
- import webbrowser
32
- import asyncio
33
-
34
-
35
- # Set seed for reproducibility
36
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to its deterministic mode and turns off its
    benchmark mode.

    Args:
        seed: Integer passed unchanged to every seeding call.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
43
-
44
-
45
if __name__ == "__main__":
    # Script entry only: fix the RNG seeds and enable multiprocessing support
    # for frozen executables.
    # NOTE(review): indentation was lost in this view; both calls are assumed
    # to belong to the guard -- confirm against the real file.
    set_seed(42)
    multiprocessing.freeze_support()

# Default dashboard title. The former module-level ``global TitleName``
# statement was removed: ``global`` has no effect at module scope, and the
# functions that assign ``TitleName`` create their own local variable.
TitleName = "Dashboard"

# Router on which this module's endpoints are registered.
router = APIRouter()
52
-
53
-
54
from typing import Optional


class TrendAnalysisRequest(BaseModel):
    """Request body of the ``/analyze-trends/`` endpoint."""

    # Used together as the document filter when papers are fetched.
    userId: str
    topic: str
    # Optional extra filter. Declared Optional so that an explicit JSON
    # ``null`` validates as well as an omitted field; a bare ``str = None``
    # only tolerates omission under pydantic v2.
    year: Optional[str] = None
    # Zero-based page index (the endpoint reports it to users as page + 1).
    page: int = 0
59
-
60
-
61
def _page_window(total_papers, page, papers_per_page=200, min_papers_last_page=50):
    """Return ``(total_pages, skip, limit)`` for a zero-based ``page``.

    Pages hold ``papers_per_page`` papers. A trailing remainder smaller than
    ``min_papers_last_page`` does not get a page of its own; it is appended to
    the last full page instead. ``skip`` and ``limit`` are ``None`` when
    ``page`` is outside ``0 .. total_pages - 1``.
    """
    if total_papers <= papers_per_page:
        total_pages = 1
    else:
        full_pages, remaining = divmod(total_papers, papers_per_page)
        total_pages = full_pages + 1 if remaining >= min_papers_last_page else full_pages

    if page < 0 or page >= total_pages:
        return total_pages, None, None

    skip = page * papers_per_page
    if page < total_pages - 1:
        return total_pages, skip, papers_per_page
    # Last page: take everything that is left. This is the short remainder,
    # exactly one full page, or a full page plus a too-small remainder.
    return total_pages, skip, total_papers - skip


async def fetch_papers_with_pagination(request: Request, userId: str, topic: str, year: str = None, page: int = 0):
    """Fetch one page of papers for a user/topic (and optional year).

    Args:
        request: Incoming request; the collection is read from
            ``request.app.state.collection``.
        userId: Value matched against the documents' ``userId`` field.
        topic: Value matched against the documents' ``topic`` field.
        year: Optional value matched against ``year``; ignored when falsy.
        page: Zero-based page index.

    Returns:
        ``(df, page, total_pages, papers_count, total_papers)``. When nothing
        matches the result is ``(empty DataFrame, 0, 0, 0, 0)``. When ``page``
        is out of range (including negative) it is
        ``(empty DataFrame, 0, total_pages, 0, total_papers)``.
    """
    query_filter = {"userId": userId, "topic": topic}
    if year:
        query_filter["year"] = year

    collection = request.app.state.collection

    # Count the unwound papers so pagination works per paper, not per document.
    count_pipeline = [
        {"$match": query_filter},
        {"$unwind": "$papers"},
        {"$count": "total_papers"}
    ]
    count_result = await collection.aggregate(count_pipeline).to_list(length=1)
    total_papers = count_result[0]['total_papers'] if count_result else 0

    print(f"Total papers matching criteria: {total_papers}")

    if total_papers == 0:
        return pd.DataFrame(), 0, 0, 0, 0

    total_pages, skip, limit = _page_window(total_papers, page)
    if skip is None:
        # Out-of-range page. A negative page used to reach MongoDB as a
        # negative $skip; it is now reported like any other invalid page.
        return pd.DataFrame(), 0, total_pages, 0, total_papers

    print(f"Pagination: Page {page + 1} of {total_pages}, Skip {skip}, Limit {limit}")

    pipeline = [
        {"$match": query_filter},
        {"$unwind": "$papers"},
        {"$replaceRoot": {"newRoot": "$papers"}},
        {"$project": {
            "_id": 0,
            "paperId": 1,
            "url": 1,
            "title": 1,
            "abstract": 1,
            "citationCount": 1,
            "influentialCitationCount": 1,
            "embedding": 1,
            "publicationDate": 1,
            "authors": 1
        }},
        # paperId breaks ties between equal dates so that $skip/$limit slice a
        # stable order and pages neither overlap nor drop papers.
        {"$sort": {"publicationDate": 1, "paperId": 1}},
        {"$skip": skip},
        {"$limit": limit}
    ]

    papers = await collection.aggregate(pipeline).to_list(None)

    papers_count = len(papers)
    print(f"Papers Retrieved: {papers_count}")

    df = pd.DataFrame(papers)
    if "publicationDate" in df.columns:
        # Stable sort keeps the server-side tie order.
        df = df.sort_values(by="publicationDate", kind="stable")
    # reindex tolerates a column that no returned paper carried.
    print(df.reindex(columns=["paperId", "publicationDate"]).head(10))

    return df, page, total_pages, papers_count, total_papers
168
-
169
-
170
- # Preprocessing function
171
def clean_text(text):
    """Normalise ``text`` for topic extraction.

    The value is converted to a lower-case string, every character that is
    not an ASCII letter, digit or whitespace is deleted, and the remaining
    whitespace-separated tokens that appear in ``ENGLISH_STOP_WORDS`` are
    dropped. The surviving tokens are joined with single spaces.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    kept_tokens = (token for token in alnum_only.split() if token not in ENGLISH_STOP_WORDS)
    return ' '.join(kept_tokens)
175
-
176
-
177
- # Adaptive clustering and topic modeling
178
def perform_trend_analysis(df, n_trials=100):
    """Cluster the papers in ``df`` and label each cluster with BERTopic.

    UMAP and HDBSCAN hyper-parameters are tuned with an Optuna study that
    maximises the HDBSCAN validity index. The best parameters are then used
    inside a BERTopic model fitted on the cleaned abstracts together with the
    stored embeddings.

    Args:
        df: DataFrame with an ``embedding`` column holding dicts with a
            ``"vector"`` entry, and an ``abstract`` column. The caller's
            frame is not modified.
        n_trials: Number of Optuna trials (default 100, the former
            hard-coded value).

    Returns:
        ``(df, topic_labels)``. ``df`` is a new frame restricted to rows with
        a usable embedding, with ``clean_text``, ``topic``, ``x``, ``y`` and
        ``topic_label`` columns added. ``topic_labels`` maps topic id to a
        ``" | "``-joined keyword string. When no row has a usable embedding
        the result is ``(empty frame, {})``.
    """

    def convert_embedding(embedding):
        # Only dicts carrying a "vector" entry are usable; anything else
        # becomes None and is dropped below.
        if isinstance(embedding, dict) and "vector" in embedding:
            return np.array(embedding["vector"], dtype=np.float64)
        return None

    if "embedding" not in df.columns:
        return df.iloc[0:0].copy(), {}

    # Copy first so the caller's frame keeps its raw embeddings, and copy
    # again after filtering so later column assignments hit a real frame
    # rather than a slice.
    df = df.copy()
    df["embedding"] = df["embedding"].apply(convert_embedding)
    df = df.dropna(subset=["embedding"]).copy()

    if df.empty:
        return df, {}

    abstracts = df["abstract"] if "abstract" in df.columns else pd.Series("", index=df.index)
    df["clean_text"] = abstracts.fillna("").apply(clean_text)

    # The embedding matrix does not depend on the trial, so build it once.
    embeddings = np.vstack(df["embedding"].values)

    def objective(trial):
        # The suggest_* order is kept as-is: the seeded sampler draws values
        # in call order.
        umap_n_components = trial.suggest_int("umap_n_components", 1, 12)
        umap_min_dist = trial.suggest_float("umap_min_dist", 0.1, 0.8)
        umap_n_neighbors = trial.suggest_int("umap_n_neighbors", 2, 12)
        hdbscan_min_cluster_size = trial.suggest_int("hdbscan_min_cluster_size", 2, 10)
        hdbscan_min_samples = trial.suggest_int("hdbscan_min_samples", 1, 10)
        hdbscan_cluster_selection_epsilon = trial.suggest_float("hdbscan_cluster_selection_epsilon", 0.2, 0.8)
        hdbscan_cluster_selection_method = trial.suggest_categorical("hdbscan_cluster_selection_method",
                                                                     ["eom", "leaf"])

        reducer_high_dim = umap.UMAP(
            n_components=umap_n_components,
            random_state=42,
            min_dist=umap_min_dist,
            n_neighbors=umap_n_neighbors,
            metric="cosine"
        )
        reduced_embeddings_high_dim = reducer_high_dim.fit_transform(embeddings).astype(np.float64)

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=hdbscan_min_cluster_size,
            min_samples=hdbscan_min_samples,
            cluster_selection_epsilon=hdbscan_cluster_selection_epsilon,
            cluster_selection_method=hdbscan_cluster_selection_method,
            prediction_data=True,
            core_dist_n_jobs=1
        )
        labels = clusterer.fit_predict(reduced_embeddings_high_dim)

        # A single label cannot be scored; rank such trials last.
        if len(set(labels)) > 1:
            return hdbscan.validity.validity_index(reduced_embeddings_high_dim, labels)
        return -np.inf

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    umap_model = umap.UMAP(
        n_components=best_params["umap_n_components"],
        random_state=42,
        min_dist=best_params["umap_min_dist"],
        n_neighbors=best_params["umap_n_neighbors"],
        metric="cosine"
    )
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=best_params["hdbscan_min_cluster_size"],
        min_samples=best_params["hdbscan_min_samples"],
        cluster_selection_epsilon=best_params["hdbscan_cluster_selection_epsilon"],
        cluster_selection_method=best_params["hdbscan_cluster_selection_method"],
        prediction_data=True,
        core_dist_n_jobs=1
    )

    vectorizer = CountVectorizer(
        stop_words=list(ENGLISH_STOP_WORDS),
        ngram_range=(2, 3)
    )

    representation_model = KeyBERTInspired()
    embedding_model = SentenceTransformer("allenai/specter")
    topic_model = BERTopic(
        vectorizer_model=vectorizer,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        nr_topics='auto',
        top_n_words=8,
        representation_model=representation_model,
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=True)
    )

    topics, _ = topic_model.fit_transform(df["clean_text"], embeddings)
    df["topic"] = topics
    # get_topic() returns False for an unknown topic id; "or []" turns that
    # into an empty label instead of a TypeError while unpacking.
    topic_labels = {
        t: " | ".join([word for word, _ in (topic_model.get_topic(t) or [])][:8])
        for t in set(topics)
    }

    # Separate 2-D projection used only for plotting coordinates.
    reduced_embeddings_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(
        embeddings).astype(np.float64)
    df["x"] = reduced_embeddings_2d[:, 0]
    df["y"] = reduced_embeddings_2d[:, 1]
    df["topic_label"] = df["topic"].map(topic_labels)

    return df, topic_labels
280
-
281
-
282
def build_dashboard(df, titleNm, topic_year):
    """Build the Dash app that visualises topic clusters and their papers.

    Args:
        df: Analysed papers. The code reads the columns ``topic``, ``x``,
            ``y``, ``citationCount``, ``influentialCitationCount``,
            ``title``, ``url``, ``paperId`` and, when present,
            ``topic_label``. The caller's frame is not modified.
        titleNm: Topic name shown in the figure title.
        topic_year: Optional year appended to the title as ``"<topic>_<year>"``.
            May be ``None``, in which case only the topic is shown.

    Returns:
        A configured ``dash.Dash`` application (not yet running).
    """
    # topic_year is None whenever the request had no year filter; plain string
    # concatenation raised TypeError in that case.
    dashboard_title = f"{titleNm}_{topic_year}" if topic_year else str(titleNm)

    df = df.copy()

    color_palette = px.colors.qualitative.Vivid
    unique_topics = sorted(df["topic"].unique())
    color_map = {topic: color_palette[i % len(color_palette)] for i, topic in enumerate(unique_topics)}
    df["color"] = df["topic"].map(color_map)

    # Number of papers per cluster, attached to every paper row.
    cluster_sizes = df.groupby("topic").size().reset_index(name="paper_count")
    df = df.merge(cluster_sizes, on="topic", how="left")

    # Scale marker size linearly with cluster size between min_size and max_size.
    min_size = 50
    max_size = 140
    smallest = df["paper_count"].min()
    size_span = df["paper_count"].max() - smallest
    if size_span == 0:
        # All clusters equally large (or only one cluster): the linear scaling
        # would divide by zero and yield NaN sizes, so use the midpoint.
        df["marker_size"] = (min_size + max_size) / 2
    else:
        df["marker_size"] = (df["paper_count"] - smallest) / size_span * (max_size - min_size) + min_size

    # Log-transformed citation metrics; missing or non-numeric counts are
    # treated as 0 so that log1p never sees None.
    df["log_citation"] = np.log1p(pd.to_numeric(df["citationCount"], errors="coerce").fillna(0))
    df["log_influence"] = np.log1p(pd.to_numeric(df["influentialCitationCount"], errors="coerce").fillna(0))

    # Shrink each cluster's mean towards the global median so that small
    # clusters are not dominated by a few papers.
    global_median_citation = df["log_citation"].median()
    global_median_influence = df["log_influence"].median()
    C = 10  # Shrinkage constant

    def bayesian_shrinkage(group, global_median, C):
        return (group.sum() + C * global_median) / (len(group) + C)

    adjusted_citations = df.groupby("topic")["log_citation"].apply(
        lambda x: bayesian_shrinkage(x, global_median_citation, C))
    adjusted_influence = df.groupby("topic")["log_influence"].apply(
        lambda x: bayesian_shrinkage(x, global_median_influence, C))

    df = df.merge(adjusted_citations.rename("adjusted_citation"), on="topic")
    df = df.merge(adjusted_influence.rename("adjusted_influence"), on="topic")

    # Quartile thresholds over all paper rows.
    citation_25th = df["adjusted_citation"].quantile(0.25)
    citation_75th = df["adjusted_citation"].quantile(0.75)
    influence_25th = df["adjusted_influence"].quantile(0.25)
    influence_75th = df["adjusted_influence"].quantile(0.75)

    def classify_theme(row):
        if row["adjusted_citation"] >= citation_75th and row["adjusted_influence"] >= influence_75th:
            return "🔥 Hot Topic"
        elif row["adjusted_citation"] <= citation_25th and row["adjusted_influence"] >= influence_75th:
            return "💎 Gap Opportunity"
        elif row["adjusted_citation"] >= citation_75th and row["adjusted_influence"] <= influence_25th:
            return "⚠️ Risky Theme"
        else:
            return "🔄 Neutral"

    df["theme"] = df.apply(classify_theme, axis=1)

    app = dash.Dash(__name__, external_stylesheets=[dbc.themes.DARKLY])

    fig = go.Figure()

    # Faint grid lines for reference.
    fig.update_xaxes(
        showgrid=True,
        gridwidth=0.1,
        gridcolor='rgba(255, 255, 255, 0.05)',
        zeroline=False
    )
    fig.update_yaxes(
        showgrid=True,
        gridwidth=0.1,
        gridcolor='rgba(255, 255, 255, 0.05)',
        zeroline=False
    )

    for topic in unique_topics:
        topic_data = df[df["topic"] == topic]
        first_row = topic_data.iloc[0]

        # One bubble per cluster, placed at the mean 2-D position of its papers.
        center_x = topic_data["x"].mean()
        center_y = topic_data["y"].mean()

        # str() guards the concatenation below against a non-string label.
        full_topic_formatted = str(first_row["topic_label"]) if "topic_label" in topic_data.columns \
            else f"Cluster {topic}"

        # Larger translucent circle behind the bubble as a glow. hoverinfo is
        # "skip" so this decorative trace fires no click events; with "none"
        # a click on the rim reached the callback without customdata.
        fig.add_trace(
            go.Scatter(
                x=[center_x],
                y=[center_y],
                mode="markers",
                marker=dict(
                    color=color_map[topic],
                    size=first_row["marker_size"] * 1.2,
                    opacity=0.3,
                    line=dict(width=0),
                    symbol="circle",
                ),
                showlegend=False,
                hoverinfo="skip",
            )
        )

        popularity = "🔼 High" if first_row["adjusted_citation"] >= citation_75th else "🔽 Low"
        impact = "🔼 High" if first_row["adjusted_influence"] >= influence_75th else "🔽 Low"
        hover_lines = [
            "<b>Cluster ID:</b> %{text}",
            "<b>Name:</b><br>" + full_topic_formatted,
            "<b>Papers:</b> " + str(first_row["paper_count"]),
            "<b>Popularity:</b> " + popularity
            + f" (Adjusted Citation: {first_row['adjusted_citation']:.2f})",
            "<b>Impactfulness:</b> " + impact
            + f" (Adjusted Influence: {first_row['adjusted_influence']:.2f})",
            "<b>Theme:</b> " + first_row["theme"],
        ]

        # Main cluster bubble; customdata carries the topic id for the click callback.
        fig.add_trace(
            go.Scatter(
                x=[center_x],
                y=[center_y],
                mode="markers+text",
                marker=dict(
                    color=color_map[topic],
                    size=first_row["marker_size"],
                    opacity=0.85,
                    line=dict(width=2, color="white"),
                    symbol="circle",
                ),
                text=[f"{topic}"],
                textposition="middle center",
                textfont=dict(
                    family="Arial Black",
                    size=16,
                    color="white"
                ),
                name=f"{topic}",
                hovertemplate="<br>".join(hover_lines) + "<extra></extra>",
                customdata=[[topic]],
            )
        )

    fig.update_layout(
        shapes=[
            # Dark backdrop covering the whole paper area.
            dict(
                type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1,
                y1=1,
                fillcolor="rgba(0, 0, 40, 0.95)",
                line_width=0,
                layer="below"
            ),
            # Lighter circle in the middle for a soft highlight.
            dict(
                type="circle",
                xref="paper",
                yref="paper",
                x0=0.3,
                y0=0.3,
                x1=0.7,
                y1=0.7,
                fillcolor="rgba(50, 50, 120, 0.2)",
                line_width=0,
                layer="below"
            )
        ],
        template="plotly_dark",
        title={
            'text': f"<b>{dashboard_title.title()}</b>",
            'y': 0.97,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                family="Arial Black",
                size=28,
                color="white",
            ),
            'xref': 'paper',
            'yref': 'paper',
        },
        margin=dict(l=40, r=40, b=150, t=100),
        hovermode="closest",
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        dragmode="pan",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            bgcolor="rgba(30,30,60,0.5)",
            bordercolor="rgba(255,255,255,0.2)",
            borderwidth=1
        ),
    )

    # "Reset View" button that clears both axis ranges.
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                showactive=False,
                buttons=[
                    dict(
                        label="Reset View",
                        method="relayout",
                        args=[{"xaxis.range": None, "yaxis.range": None}]
                    ),
                ],
                x=0.05,
                y=0.05,
                xanchor="left",
                yanchor="bottom",
                bgcolor="rgba(50,50,80,0.7)",
                bordercolor="rgba(255,255,255,0.2)",
            )
        ]
    )

    def paper_list_placeholder():
        # Hint shown in the side panel until a cluster has been clicked.
        return html.Div([
            html.Div(
                html.I(className="fas fa-mouse-pointer", style={"marginRight": "10px"}),
                style={"textAlign": "center", "fontSize": "24px", "marginBottom": "10px", "color": "#7f8fa6"}
            ),
            html.P("Click on a cluster to view its papers",
                   style={"textAlign": "center", "color": "#7f8fa6"})
        ])

    # Inline styles use React's camelCase keys ("minHeight", not "min-height").
    graph_size = {"height": "80vh", "minHeight": "800px"}
    divider_style = {"borderColor": "rgba(100, 100, 200, 0.3)", "margin": "10px 0 20px 0"}
    theme_legend = [
        ("🔥", "Hot Topic: High citations & high influence"),
        ("💎", "Gap Opportunity: Low citations but high influence"),
        ("⚠️", "Risky Theme: High citations but low influence"),
        ("🔄", "Neutral: Average citations and influence"),
    ]

    app.layout = dbc.Container(
        fluid=True,
        style={
            "backgroundColor": "#111122",
            "minHeight": "100vh",
            "height": "100%",
            "width": "100%",
            "backgroundImage": "linear-gradient(135deg, #111122 0%, #15162c 100%)",
            "padding": "20px"
        },
        children=[
            # Header: title, save button and subtitle.
            dbc.Row([
                dbc.Col(html.H1(
                    "Trend Analysis Dashboard ",
                    style={
                        "textAlign": "center",
                        "color": "white",
                        "marginBottom": "5px",
                        "fontFamily": "Arial Black",
                        "textShadow": "2px 2px 8px rgba(0,0,0,0.7)",
                        "letterSpacing": "2px",
                        "fontSize": "42px",
                        "background": "linear-gradient(135deg, #790091 0%, #565cd5 100%)",
                        "WebkitBackgroundClip": "text",
                        "WebkitTextFillColor": "transparent",
                        "paddingTop": "10px"
                    }
                ), width=10),

                dbc.Col([
                    html.Button(
                        [
                            html.I(className="fas fa-download mr-2"),
                            " Save Dashboard"
                        ],
                        id="download-button",
                        className="btn btn-outline-light",
                        style={
                            "marginTop": "10px",
                            "backgroundColor": "rgba(80, 80, 150, 0.4)",
                            "border": "1px solid rgba(100, 100, 200, 0.5)",
                            "borderRadius": "8px",
                            "padding": "8px 15px",
                            "boxShadow": "0px 4px 8px rgba(0, 0, 0, 0.3)",
                            "transition": "all 0.3s ease",
                            "fontSize": "14px",
                            "fontWeight": "bold"
                        }
                    ),
                    dcc.Download(id="download-dashboard")
                ], width=2),

                dbc.Col(html.P(
                    "Interactive visualization of research topics and their relationships",
                    style={
                        "textAlign": "center",
                        "color": "#aaddff",
                        "marginBottom": "15px",
                        "fontStyle": "italic",
                        "fontSize": "16px",
                        "fontWeight": "300",
                        "letterSpacing": "0.5px",
                        "textShadow": "1px 1px 3px rgba(0,0,0,0.5)",
                    }
                ), width=12),
            ]),

            # Main row: cluster graph on the left, paper list on the right.
            dbc.Row([
                dbc.Col(
                    dbc.Card(
                        dbc.CardBody([
                            dcc.Graph(
                                id="cluster-graph",
                                figure=fig,
                                config={
                                    "scrollZoom": True,
                                    "displayModeBar": True,
                                    "modeBarButtonsToRemove": ["select2d", "lasso2d"]
                                },
                                style=dict(graph_size)
                            )
                        ], style=dict(graph_size)),
                        style={
                            "backgroundColor": "rgba(20, 20, 40, 0.7)",
                            "borderRadius": "15px",
                            "boxShadow": "0px 10px 30px rgba(0, 0, 0, 0.5)",
                            "border": "1px solid rgba(100, 100, 200, 0.3)",
                            "height": "80vh",
                            "minHeight": "800px"
                        }
                    ),
                    width=9
                ),

                dbc.Col(
                    dbc.Card(
                        dbc.CardBody([
                            html.H3("Paper List", style={
                                "textAlign": "center",
                                "marginBottom": "15px",
                                "color": "#ffffff",
                                "fontFamily": "Arial",
                                "fontWeight": "bold",
                                "textShadow": "1px 1px 3px rgba(0,0,0,0.3)"
                            }),
                            html.Hr(style=dict(divider_style)),
                            html.Div(
                                id="paper-list",
                                style={
                                    "overflowY": "auto",
                                    "height": "700px",
                                    "padding": "5px"
                                },
                                children=paper_list_placeholder()
                            ),
                        ],
                            style={
                                "backgroundColor": "rgba(30, 30, 50, 0.8)",
                                "borderRadius": "15px",
                                "padding": "20px",
                                "height": "100%"
                            }),
                        style={
                            "height": "800px",
                            "boxShadow": "0px 10px 30px rgba(0, 0, 0, 0.5)",
                            "border": "1px solid rgba(100, 100, 200, 0.3)",
                            "borderRadius": "15px"
                        }
                    ),
                    width=3
                ),
            ], style={"marginTop": "20px"}),

            # Footer: legend explaining the four themes.
            dbc.Row([
                dbc.Col(
                    dbc.Card(
                        dbc.CardBody([
                            html.H5("Theme Legend", style={"textAlign": "center", "marginBottom": "15px"}),
                            dbc.Row([
                                dbc.Col(html.Div([
                                    html.Span(emoji, style={"fontSize": "20px", "marginRight": "10px"}),
                                    description
                                ]), width=3)
                                for emoji, description in theme_legend
                            ])
                        ]),
                        style={
                            "backgroundColor": "rgba(30, 30, 50, 0.8)",
                            "borderRadius": "15px",
                            "marginTop": "20px",
                            "boxShadow": "0px 5px 15px rgba(0, 0, 0, 0.3)",
                            "border": "1px solid rgba(100, 100, 200, 0.3)"
                        }
                    ),
                    width=12
                ),
            ]),

            dcc.Store(id="stored-figure", data=fig)
        ]
    )

    @app.callback(
        Output("download-dashboard", "data"),
        Input("download-button", "n_clicks"),
        State("cluster-graph", "figure"),
        prevent_initial_call=True
    )
    def download_dashboard(n_clicks, figure):
        if n_clicks is None:
            return None

        # Export the current figure as a standalone page; plotly.js is
        # referenced from the CDN rather than embedded.
        dashboard_html = pio.to_html(
            figure,
            full_html=True,
            include_plotlyjs='cdn',
            config={'responsive': True}
        )

        return dict(
            content=dashboard_html,
            filename="research_dashboard.html",
            type="text/html",
        )

    @app.callback(
        Output("paper-list", "children"),
        [Input("cluster-graph", "clickData")]
    )
    def update_paper_list(clickData):
        if clickData is None:
            return paper_list_placeholder()

        try:
            clicked_topic = clickData["points"][0]["customdata"][0]
            # Reuse the cluster's colour and theme for consistent styling.
            topic_color = color_map[clicked_topic]
            topic_theme = df[df["topic"] == clicked_topic]["theme"].iloc[0]
        except (KeyError, IndexError):
            return html.Div("Error retrieving cluster data.", style={"textAlign": "center", "marginTop": "20px"})

        cluster_rows = df[df["topic"] == clicked_topic]
        papers_in_cluster = cluster_rows[["title", "url", "paperId"]]

        if papers_in_cluster.empty:
            return html.Div(f"No papers found for Cluster {clicked_topic}.",
                            style={"textAlign": "center", "marginTop": "20px"})

        topic_label = cluster_rows['topic_label'].iloc[0] if 'topic_label' in df.columns \
            else f"Cluster {clicked_topic}"

        # One card per paper; the title links to the paper URL in a new tab.
        # Hover effects come from the ".paper-card:hover" rule in index_string.
        # The nested ":hover" entry that used to sit in this inline style was
        # dropped because inline styles cannot express pseudo-classes.
        paper_list = [
            dbc.Card(
                dbc.CardBody([
                    html.A(
                        html.H6(
                            f"{position}. {paper_title}",
                            className="card-title",
                            style={
                                "fontSize": "14px",
                                "margin": "5px 0",
                                "fontWeight": "normal",
                                "lineHeight": "1.4",
                                "color": "#aaccff",
                                "cursor": "pointer"
                            }
                        ),
                        href=paper_url,
                        target="_blank",
                        style={"textDecoration": "none"}
                    ),
                ], style={"padding": "12px"}),
                style={
                    "marginBottom": "10px",
                    "backgroundColor": "rgba(40, 45, 60, 0.8)",
                    "borderRadius": "8px",
                    "borderLeft": f"4px solid {topic_color}",
                    "boxShadow": "0px 3px 8px rgba(0, 0, 0, 0.2)",
                    "transition": "transform 0.2s",
                },
                className="paper-card"
            )
            for position, (paper_title, paper_url) in enumerate(
                zip(papers_in_cluster["title"], papers_in_cluster["url"]), start=1)
        ]

        return html.Div([
            html.Div([
                html.H4(
                    f"Cluster {clicked_topic}",
                    style={
                        "textAlign": "center",
                        "marginBottom": "5px",
                        "color": topic_color,
                        "fontWeight": "bold"
                    }
                ),
                html.H5(
                    topic_label,
                    style={
                        "textAlign": "center",
                        "marginBottom": "5px",
                        "color": "#aaaacc",
                        "fontStyle": "italic",
                        "fontWeight": "normal"
                    }
                ),
                html.Div(
                    topic_theme,
                    style={
                        "textAlign": "center",
                        "marginBottom": "15px",
                        "fontSize": "16px",
                        "fontWeight": "bold"
                    }
                ),
                html.Hr(style=dict(divider_style)),
                html.H5(
                    f"Papers ({len(papers_in_cluster)})",
                    style={
                        "textAlign": "left",
                        "marginBottom": "15px",
                        "color": "#ffffff",
                        "fontWeight": "bold"
                    }
                ),
            ]),
            html.Div(
                paper_list,
                style={"paddingRight": "10px"},
            )
        ])

    # Page template: adds Font Awesome plus the hover and scroll-bar CSS.
    app.index_string = '''
<!DOCTYPE html>
<html>
<head>
{%metas%}
<title>Trend Analysis Clusters Dashboard</title>
{%favicon%}
{%css%}
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
<style>
.paper-card:hover {
transform: translateY(-2px);
box-shadow: 0px 5px 10px rgba(0, 0, 0, 0.3);
background-color: rgba(50, 55, 70, 0.8) !important;
}
a h6:hover {
color: #ffffff !important;
text-decoration: underline;
}
/* Add subtle scroll bar styling */
::-webkit-scrollbar {
width: 8px;
}
::-webkit-scrollbar-track {
background: rgba(30, 30, 50, 0.3);
border-radius: 10px;
}
::-webkit-scrollbar-thumb {
background: rgba(100, 100, 200, 0.5);
border-radius: 10px;
}
::-webkit-scrollbar-thumb:hover {
background: rgba(120, 120, 220, 0.7);
}
</style>
</head>
<body>
{%app_entry%}
<footer>
{%config%}
{%scripts%}
{%renderer%}
</footer>
</body>
</html>
'''
    return app
904
-
905
-
906
- # Global variables to track Dash app state
907
# Module-level state of the single dashboard this module manages.
dash_thread = None  # thread currently running the Dash server, if any
dash_app = None  # Dash application currently being served, if any
DASH_PORT = 7050  # port the dashboard is served on


def shutdown_dash_app():
    """Stop the previously started dashboard, best effort.

    Raises ``SystemExit`` inside the dashboard thread and waits up to two
    seconds for it to end. It then sends SIGTERM to any *other* process that
    holds a socket bound to ``DASH_PORT`` (requires ``psutil``; skipped when
    it is not installed). The module state is always reset afterwards.

    Returns:
        Always ``True`` -- also when nothing was running or an error occurred.
    """
    global dash_thread, dash_app

    import ctypes
    import os
    import signal

    if dash_app is None:
        return True  # No app to shut down

    try:
        print("Shutting down previous Dash app...")

        if hasattr(dash_app, 'server'):
            # Flag is only set here; nothing visible in this module reads it.
            dash_app._shutdown = True

        if dash_thread and dash_thread.is_alive():
            # The C API takes the thread id as ``unsigned long``.
            ctypes.pythonapi.PyThreadState_SetAsyncExc(
                ctypes.c_ulong(dash_thread.ident),
                ctypes.py_object(SystemExit)
            )
            dash_thread.join(timeout=2)

        try:
            import psutil
        except ImportError:
            print("Could not find process using port")
        else:
            own_pid = os.getpid()
            for proc in psutil.process_iter(['pid']):
                # The dashboard runs in a thread of this very process, so a
                # lingering socket on DASH_PORT can belong to us. Never
                # SIGTERM ourselves -- that would take the API server down.
                if proc.pid == own_pid:
                    continue
                try:
                    for conn in proc.connections(kind='inet'):
                        if conn.laddr and conn.laddr.port == DASH_PORT:
                            print(f"Killing process {proc.pid} using port {DASH_PORT}")
                            os.kill(proc.pid, signal.SIGTERM)
                            break
                except (psutil.Error, OSError):
                    # Process vanished or is not accessible: skip it.
                    continue

        print("Previous Dash app successfully shut down")

    except Exception as e:
        # Best effort: report the problem but still reset the state below.
        print(f"Error shutting down Dash app: {e}")

    dash_app = None
    dash_thread = None
    return True
963
-
964
-
965
- # Updated function to run Dash with error handling
966
def run_dash(df, titleNm, Topic_year):
    """Build the dashboard and serve it on ``DASH_PORT`` (blocking).

    Intended as a thread target: the call only returns when the server stops
    or fails. The built app is stored in the module-level ``dash_app``; on any
    error the error is printed and ``dash_app`` is reset to ``None``.

    Args:
        df: Analysed papers, passed on to ``build_dashboard``.
        titleNm: Topic name for the dashboard title.
        Topic_year: Optional year for the dashboard title.
    """
    global dash_app

    try:
        dash_app = build_dashboard(df, titleNm, Topic_year)

        # Prefer ``run``; ``run_server`` is the legacy name and is only used
        # when the installed Dash has no ``run`` method.
        start_server = getattr(dash_app, "run", None) or dash_app.run_server
        start_server(debug=False, port=DASH_PORT, use_reloader=False)
    except Exception as e:
        print(f"Error running Dash app: {e}")
        dash_app = None
978
-
979
-
980
- # Update your endpoint - removed request parameter from shutdown_dash_app
981
@router.post("/analyze-trends/")
async def analyze_trends(request: Request, data_request: TrendAnalysisRequest):
    """Run the trend analysis for one page of papers and start the dashboard.

    Shuts down any previous dashboard, fetches the requested page, clusters
    the papers, then starts the Dash server and a browser opener in daemon
    threads.

    Raises:
        HTTPException: 404 when the page is out of range or nothing matches
            the filter; 500 when no paper has a usable embedding.

    Returns:
        A dict with pagination details, per-cluster sizes and titles, and the
        dashboard URL.
    """
    global dash_thread
    topic_name = data_request.topic
    topic_year = data_request.year

    # Make sure any existing dashboard is shut down first.
    shutdown_dash_app()

    # Give the port a moment to be freed. asyncio.sleep keeps the event loop
    # responsive; time.sleep would block every other request for that second.
    await asyncio.sleep(1)

    df, current_page, total_pages, papers_count, total_papers = await fetch_papers_with_pagination(
        request, data_request.userId, data_request.topic, data_request.year, data_request.page
    )

    if df.empty and total_papers > 0:
        raise HTTPException(
            status_code=404,
            detail=f"No papers found for page {data_request.page + 1}. Valid pages are 1 to {total_pages}."
        )
    elif df.empty:
        raise HTTPException(
            status_code=404,
            detail=f"No papers found for userId '{data_request.userId}', topic '{data_request.topic}'" +
                   (f", and year '{data_request.year}'" if data_request.year else "")
        )

    # NOTE(review): this call is synchronous and CPU-heavy, so it blocks the
    # event loop while it runs; consider moving it to an executor.
    df, topic_labels = perform_trend_analysis(df)

    if df.empty:
        raise HTTPException(status_code=500, detail="Failed to process embeddings for trend analysis")

    # Plain ints for keys and values so the response encodes cleanly as JSON.
    cluster_sizes = {int(topic): int(count) for topic, count in df.groupby("topic").size().items()}

    # Serve the dashboard in a daemon thread so it never blocks shutdown.
    dash_thread = threading.Thread(target=run_dash, args=(df, topic_name, topic_year), daemon=True)
    dash_thread.start()

    # Open the browser from a second daemon thread once the server has had time to start.
    browser_thread = threading.Thread(target=open_browser, daemon=True)
    browser_thread.start()

    return {
        "message": f"Trend analysis completed for papers (page {current_page + 1} of {total_pages})",
        "current_page": current_page,
        "total_pages": total_pages,
        "papers_count": papers_count,
        "total_papers": total_papers,
        "cluster_sizes": cluster_sizes,
        "cluster_titles": topic_labels,
        "dashboard_url": f"http://localhost:{DASH_PORT}"
    }
1039
-
1040
-
1041
- # Function to open browser after a short delay
1042
def open_browser():
    """Pause briefly, then open the dashboard URL in a new browser window."""
    startup_delay_seconds = 2  # Wait for servers to start
    time.sleep(startup_delay_seconds)
    dashboard_url = f"http://localhost:{DASH_PORT}"
    webbrowser.open_new(dashboard_url)
 
1
import asyncio
import json
import multiprocessing
import random
import re
import threading
import time
import webbrowser
from typing import Optional

import dash
import dash_bootstrap_components as dbc
import hdbscan
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import torch
import umap
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from dash import dcc, html, Input, Output, State
from fastapi import HTTPException, APIRouter, Request
from motor.motor_asyncio import AsyncIOMotorClient
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from skopt import gp_minimize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
33
+
34
+
35
+ # Set seed for reproducibility
36
def set_seed(seed=42):
    """Seed every random number generator this module relies on.

    Applies ``seed`` to Python's ``random``, NumPy and PyTorch (CPU and all CUDA
    devices), then puts cuDNN into deterministic mode with benchmarking off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
43
+
44
+
45
if __name__ == "__main__":
    # Seeding and frozen-executable multiprocessing support only apply when the
    # file is executed directly, not when it is imported.
    set_seed(42)
    multiprocessing.freeze_support()

# Default dashboard title. A ``global`` statement at module level has no
# effect, so the name is simply assigned here.
TitleName = "Dashboard"
# Router on which this module registers its endpoints.
router = APIRouter()
52
+
53
+
54
class TrendAnalysisRequest(BaseModel):
    """Request body accepted by the ``/analyze-trends/`` endpoint."""

    # Identifier of the user whose stored papers are analysed.
    userId: str
    # Topic the stored papers were collected under.
    topic: str
    # Optional year filter; omitted or ``None`` means "do not filter by year".
    # Declared Optional so that an explicit JSON ``null`` validates as well.
    year: Optional[str] = None
    # Zero-based page index (page 0 is reported to the user as "page 1").
    page: int = 0
59
+
60
+
61
def _page_window(total_papers, page, papers_per_page=200, min_papers_last_page=50):
    """Return ``(total_pages, skip, limit)`` for a zero-based ``page``.

    Pages hold ``papers_per_page`` papers. A trailing remainder smaller than
    ``min_papers_last_page`` gets no page of its own; it is appended to the
    last full page instead. ``skip`` and ``limit`` are ``None`` when ``page``
    lies outside ``0 .. total_pages - 1``.
    """
    full_pages, remainder = divmod(total_papers, papers_per_page)
    if full_pages == 0:
        # Everything fits on a single (possibly short) page.
        total_pages = 1
    elif remainder >= min_papers_last_page:
        total_pages = full_pages + 1
    else:
        total_pages = full_pages

    if not 0 <= page < total_pages:
        return total_pages, None, None

    skip = page * papers_per_page
    # The last page takes whatever is left, which covers the short-remainder
    # page, the exactly-full page and the "full page + small remainder" case.
    limit = total_papers - skip if page == total_pages - 1 else papers_per_page
    return total_pages, skip, limit


async def fetch_papers_with_pagination(request: Request, userId: str, topic: str, year: str = None, page: int = 0):
    """Load one page of a user's papers for a topic from MongoDB.

    Args:
        request: Incoming request; the collection is read from
            ``request.app.state.collection``.
        userId: Value matched against the documents' ``userId`` field.
        topic: Value matched against the documents' ``topic`` field.
        year: Optional value matched against ``year``; ignored when falsy.
        page: Zero-based page index.

    Returns:
        ``(df, page, total_pages, papers_count, total_papers)``. When nothing
        matches, every number is 0 and ``df`` is empty. When ``page`` is out of
        range (including negative), ``df`` is empty and the tuple is
        ``(df, 0, total_pages, 0, total_papers)``.
    """
    # Build the query filter
    query_filter = {"userId": userId, "topic": topic}
    if year:
        query_filter["year"] = year

    collection = request.app.state.collection

    # Count total matching papers (one per element of each document's ``papers`` array)
    count_pipeline = [
        {"$match": query_filter},
        {"$unwind": "$papers"},
        {"$count": "total_papers"}
    ]
    count_result = await collection.aggregate(count_pipeline).to_list(length=1)
    total_papers = count_result[0]['total_papers'] if count_result else 0

    print(f"Total papers matching criteria: {total_papers}")

    # If no papers found, return empty result
    if total_papers == 0:
        return pd.DataFrame(), 0, 0, 0, 0

    total_pages, skip, limit = _page_window(total_papers, page)

    # Reject pages outside the valid range; a negative page would otherwise
    # reach MongoDB as a negative ``$skip``.
    if skip is None:
        return pd.DataFrame(), 0, total_pages, 0, total_papers

    print(f"Pagination: Page {page + 1} of {total_pages}, Skip {skip}, Limit {limit}")

    # MongoDB aggregation pipeline
    pipeline = [
        {"$match": query_filter},
        {"$unwind": "$papers"},
        {"$replaceRoot": {"newRoot": "$papers"}},
        {"$project": {
            "_id": 0,
            "paperId": 1,
            "url": 1,
            "title": 1,
            "abstract": 1,
            "citationCount": 1,
            "influentialCitationCount": 1,
            "embedding": 1,
            "publicationDate": 1,
            "authors": 1
        }},
        # ``paperId`` breaks ties between equal dates so that consecutive
        # skip/limit windows neither repeat nor drop papers.
        {"$sort": {"publicationDate": 1, "paperId": 1}},
        {"$skip": skip},
        {"$limit": limit}
    ]

    # Execute the aggregation pipeline
    papers = await collection.aggregate(pipeline).to_list(None)

    papers_count = len(papers)
    print(f"Papers Retrieved: {papers_count}")

    # Convert to DataFrame
    df = pd.DataFrame(papers)
    if "publicationDate" in df.columns:
        # Stable sort keeps the database's tie order for equal dates.
        df = df.sort_values(by="publicationDate", kind="stable")
    # ``reindex`` tolerates a missing column, so this debug print cannot raise.
    print(df.reindex(columns=["paperId", "publicationDate"]).head(10))

    return df, page, total_pages, papers_count, total_papers
168
+
169
+
170
+ # Preprocessing function
171
def clean_text(text):
    """Normalise ``text`` for topic modelling.

    Lower-cases the string form of ``text``, deletes every character that is
    not an ASCII letter, digit or whitespace, and drops English stop words.
    The surviving words are returned joined by single spaces.
    """
    lowered = str(text).lower()
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    kept_words = (token for token in stripped.split() if token not in ENGLISH_STOP_WORDS)
    return ' '.join(kept_words)
175
+
176
+
177
+ # Adaptive clustering and topic modeling
178
def perform_trend_analysis(df, n_trials=100):
    """Cluster papers by their embeddings and label each cluster with BERTopic.

    UMAP and HDBSCAN hyper-parameters are tuned with Optuna, maximising
    HDBSCAN's validity index. The best parameters drive a BERTopic model fitted
    on the cleaned abstracts together with the stored embeddings. Each paper
    also receives 2-D UMAP coordinates for plotting.

    Args:
        df: Papers with an ``embedding`` column (dicts holding a ``"vector"``
            list) and an ``abstract`` column. The caller's frame is not modified.
        n_trials: Number of Optuna trials (default 100).

    Returns:
        ``(df, topic_labels)``. ``df`` holds only the rows with a usable
        embedding, extended with ``clean_text``, ``topic``, ``x``, ``y`` and
        ``topic_label``. ``topic_labels`` maps topic id to a ``" | "``-joined
        keyword string. When no row has a usable embedding the result is an
        empty frame and ``{}``.
    """
    def convert_embedding(embedding):
        # Only ``{"vector": [...]}`` payloads are usable; everything else is dropped.
        if isinstance(embedding, dict) and "vector" in embedding:
            return np.array(embedding["vector"], dtype=np.float64)
        return None

    # Work on copies: the caller's frame stays untouched and later column
    # assignments never write into a slice of another frame.
    df = df.copy()
    df["embedding"] = df["embedding"].apply(convert_embedding)
    df = df.dropna(subset=["embedding"]).copy()

    if df.empty:
        return df, {}

    df["clean_text"] = (df["abstract"].fillna("")).apply(clean_text)

    # Stack the vectors once; every trial and the final models reuse this matrix.
    embedding_matrix = np.vstack(df["embedding"].values)

    def objective(trial):
        umap_n_components = trial.suggest_int("umap_n_components", 1, 12)
        umap_min_dist = trial.suggest_float("umap_min_dist", 0.1, 0.8)
        umap_n_neighbors = trial.suggest_int("umap_n_neighbors", 2, 12)
        hdbscan_min_cluster_size = trial.suggest_int("hdbscan_min_cluster_size", 2, 10)
        hdbscan_min_samples = trial.suggest_int("hdbscan_min_samples", 1, 10)
        hdbscan_cluster_selection_epsilon = trial.suggest_float("hdbscan_cluster_selection_epsilon", 0.2, 0.8)
        hdbscan_cluster_selection_method = trial.suggest_categorical("hdbscan_cluster_selection_method",
                                                                     ["eom", "leaf"])

        reducer_high_dim = umap.UMAP(
            n_components=umap_n_components,
            random_state=42,
            min_dist=umap_min_dist,
            n_neighbors=umap_n_neighbors,
            metric="cosine"
        )
        reduced_embeddings_high_dim = reducer_high_dim.fit_transform(embedding_matrix).astype(np.float64)

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=hdbscan_min_cluster_size,
            min_samples=hdbscan_min_samples,
            cluster_selection_epsilon=hdbscan_cluster_selection_epsilon,
            cluster_selection_method=hdbscan_cluster_selection_method,
            prediction_data=True,
            core_dist_n_jobs=1
        )
        labels = clusterer.fit_predict(reduced_embeddings_high_dim)

        # A single label cannot be scored, so such trials get the worst score.
        if len(set(labels)) > 1:
            return hdbscan.validity.validity_index(reduced_embeddings_high_dim, labels)
        return -np.inf

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    umap_model = umap.UMAP(
        n_components=best_params["umap_n_components"],
        random_state=42,
        min_dist=best_params["umap_min_dist"],
        n_neighbors=best_params["umap_n_neighbors"],
        metric="cosine"
    )
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=best_params["hdbscan_min_cluster_size"],
        min_samples=best_params["hdbscan_min_samples"],
        cluster_selection_epsilon=best_params["hdbscan_cluster_selection_epsilon"],
        cluster_selection_method=best_params["hdbscan_cluster_selection_method"],
        prediction_data=True,
        core_dist_n_jobs=1
    )

    vectorizer = CountVectorizer(
        stop_words=list(ENGLISH_STOP_WORDS),
        ngram_range=(2, 3)
    )

    representation_model = KeyBERTInspired()
    embedding_model = SentenceTransformer("allenai/specter")
    topic_model = BERTopic(
        vectorizer_model=vectorizer,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        nr_topics='auto',
        top_n_words=8,
        representation_model=representation_model,
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=True)
    )

    topics, _ = topic_model.fit_transform(df["clean_text"], embedding_matrix)
    df["topic"] = topics
    topic_labels = {t: " | ".join([word for word, _ in topic_model.get_topic(t)][:8]) for t in set(topics)}

    # Separate 2-D projection used only for plotting the clusters.
    reduced_embeddings_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(
        embedding_matrix).astype(np.float64)
    df["x"] = reduced_embeddings_2d[:, 0]
    df["y"] = reduced_embeddings_2d[:, 1]
    df["topic_label"] = df["topic"].map(topic_labels)

    return df, topic_labels
280
+
281
+
282
# Page template for the Dash app: loads Font Awesome and adds the hover and
# scrollbar CSS that inline component styles cannot express.
_DASH_INDEX_TEMPLATE = '''
<!DOCTYPE html>
<html>
    <head>
        {%metas%}
        <title>Trend Analysis Clusters Dashboard</title>
        {%favicon%}
        {%css%}
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
        <style>
            .paper-card:hover {
                transform: translateY(-2px);
                box-shadow: 0px 5px 10px rgba(0, 0, 0, 0.3);
                background-color: rgba(50, 55, 70, 0.8) !important;
            }
            a h6:hover {
                color: #ffffff !important;
                text-decoration: underline;
            }
            /* Add subtle scroll bar styling */
            ::-webkit-scrollbar {
                width: 8px;
            }
            ::-webkit-scrollbar-track {
                background: rgba(30, 30, 50, 0.3);
                border-radius: 10px;
            }
            ::-webkit-scrollbar-thumb {
                background: rgba(100, 100, 200, 0.5);
                border-radius: 10px;
            }
            ::-webkit-scrollbar-thumb:hover {
                background: rgba(120, 120, 220, 0.7);
            }
        </style>
    </head>
    <body>
        {%app_entry%}
        <footer>
            {%config%}
            {%scripts%}
            {%renderer%}
        </footer>
    </body>
</html>
'''


def _paper_list_placeholder():
    """Return the hint shown in the paper panel while no cluster is selected."""
    return html.Div([
        html.Div(
            html.I(className="fas fa-mouse-pointer", style={"marginRight": "10px"}),
            style={"textAlign": "center", "fontSize": "24px", "marginBottom": "10px", "color": "#7f8fa6"}
        ),
        html.P("Click on a cluster to view its papers",
               style={"textAlign": "center", "color": "#7f8fa6"})
    ])


def _prepare_cluster_metrics(df, color_map, min_size=50, max_size=140, shrinkage=10):
    """Return ``(df, citation_75th, influence_75th)`` with per-cluster display columns added.

    Works on a copy of ``df`` and adds ``color``, ``paper_count``,
    ``marker_size``, ``log_citation``, ``log_influence``, ``adjusted_citation``,
    ``adjusted_influence`` and ``theme``.
    """
    df = df.copy()

    # Map colors to topics
    df["color"] = df["topic"].map(color_map)

    # Number of papers in each cluster
    cluster_sizes = df.groupby("topic").size().reset_index(name="paper_count")
    df = df.merge(cluster_sizes, on="topic", how="left")

    # Scale marker size linearly between min_size and max_size by cluster size.
    smallest = df["paper_count"].min()
    count_span = df["paper_count"].max() - smallest
    if count_span > 0:
        df["marker_size"] = ((df["paper_count"] - smallest) / count_span) * (max_size - min_size) + min_size
    else:
        # All clusters are the same size (or there is only one): the scaling
        # would divide by zero and produce NaN sizes, so use the midpoint.
        df["marker_size"] = (min_size + max_size) / 2

    # Log-transformed citation and influence columns
    df["log_citation"] = np.log1p(df["citationCount"])
    df["log_influence"] = np.log1p(df["influentialCitationCount"])

    # Shrink each cluster's mean towards the global median so that small
    # clusters do not dominate the ranking.
    global_median_citation = df["log_citation"].median()
    global_median_influence = df["log_influence"].median()

    def bayesian_shrinkage(group, global_median):
        return (group.sum() + shrinkage * global_median) / (len(group) + shrinkage)

    adjusted_citations = df.groupby("topic")["log_citation"].apply(
        lambda x: bayesian_shrinkage(x, global_median_citation))
    adjusted_influence = df.groupby("topic")["log_influence"].apply(
        lambda x: bayesian_shrinkage(x, global_median_influence))

    # Merge adjusted metrics back into the dataframe
    df = df.merge(adjusted_citations.rename("adjusted_citation"), on="topic")
    df = df.merge(adjusted_influence.rename("adjusted_influence"), on="topic")

    # Quartile thresholds over all rows
    citation_25th = df["adjusted_citation"].quantile(0.25)
    citation_75th = df["adjusted_citation"].quantile(0.75)
    influence_25th = df["adjusted_influence"].quantile(0.25)
    influence_75th = df["adjusted_influence"].quantile(0.75)

    def classify_theme(row):
        if row["adjusted_citation"] >= citation_75th and row["adjusted_influence"] >= influence_75th:
            return "🔥 Hot Topic"
        elif row["adjusted_citation"] <= citation_25th and row["adjusted_influence"] >= influence_75th:
            return "💎 Gap Opportunity"
        elif row["adjusted_citation"] >= citation_75th and row["adjusted_influence"] <= influence_25th:
            return "⚠️ Risky Theme"
        else:
            return "🔄 Neutral"

    df["theme"] = df.apply(classify_theme, axis=1)
    return df, citation_75th, influence_75th


def _build_cluster_figure(df, unique_topics, color_map, title_text, citation_75th, influence_75th):
    """Return the Plotly figure with one labelled bubble (plus glow) per cluster."""
    fig = go.Figure()

    # Subtle grid lines for reference
    fig.update_xaxes(
        showgrid=True,
        gridwidth=0.1,
        gridcolor='rgba(255, 255, 255, 0.05)',
        zeroline=False
    )
    fig.update_yaxes(
        showgrid=True,
        gridwidth=0.1,
        gridcolor='rgba(255, 255, 255, 0.05)',
        zeroline=False
    )

    for topic in unique_topics:
        topic_data = df[df["topic"] == topic]

        # Cluster center
        center_x = topic_data["x"].mean()
        center_y = topic_data["y"].mean()

        # Per-cluster values are identical on every row, so read the first one.
        marker_size = topic_data["marker_size"].iloc[0]
        adjusted_citation = topic_data["adjusted_citation"].iloc[0]
        adjusted_influence = topic_data["adjusted_influence"].iloc[0]
        full_topic_formatted = str(topic_data['topic_label'].iloc[0]) \
            if 'topic_label' in topic_data.columns else f"Cluster {topic}"

        # Larger translucent circle behind the bubble acts as a glow
        fig.add_trace(
            go.Scatter(
                x=[center_x],
                y=[center_y],
                mode="markers",
                marker=dict(
                    color=color_map[topic],
                    size=marker_size * 1.2,  # Slightly larger for glow effect
                    opacity=0.3,
                    line=dict(width=0),
                    symbol="circle",
                ),
                showlegend=False,
                hoverinfo="none",
            )
        )

        # Main cluster circle; ``customdata`` carries the topic id for click handling
        fig.add_trace(
            go.Scatter(
                x=[center_x],
                y=[center_y],
                mode="markers+text",
                marker=dict(
                    color=color_map[topic],
                    size=marker_size,
                    opacity=0.85,
                    line=dict(width=2, color="white"),
                    symbol="circle",
                ),
                text=[f"{topic}"],
                textposition="middle center",
                textfont=dict(
                    family="Arial Black",
                    size=16,
                    color="white"
                ),
                name=f"{topic}",
                hovertemplate=(
                    "<b>Cluster ID:</b> %{text}<br>" +
                    "<b>Name:</b><br>" + full_topic_formatted + "<br>" +
                    "<b>Papers:</b> " + str(topic_data["paper_count"].iloc[0]) + "<br>" +
                    "<b>Popularity:</b> " + (
                        "🔼 High" if adjusted_citation >= citation_75th else "🔽 Low") +
                    f" (Adjusted Citation: {adjusted_citation:.2f})<br>" +
                    "<b>Impactfulness:</b> " + (
                        "🔼 High" if adjusted_influence >= influence_75th else "🔽 Low") +
                    f" (Adjusted Influence: {adjusted_influence:.2f})<br>" +
                    "<b>Theme:</b> " + topic_data["theme"].iloc[0] +
                    "<extra></extra>"
                ),
                customdata=[[topic]],
            )
        )

    # Background shapes, title, legend and interaction defaults
    fig.update_layout(
        shapes=[
            # Dark backdrop
            dict(
                type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1,
                y1=1,
                fillcolor="rgba(0, 0, 40, 0.95)",
                line_width=0,
                layer="below"
            ),
            # Lighter circle in the middle for a subtle radial effect
            dict(
                type="circle",
                xref="paper",
                yref="paper",
                x0=0.3,
                y0=0.3,
                x1=0.7,
                y1=0.7,
                fillcolor="rgba(50, 50, 120, 0.2)",
                line_width=0,
                layer="below"
            )
        ],
        template="plotly_dark",
        title={
            'text': f"<b>{title_text.title()}</b>",
            'y': 0.97,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                family="Arial Black",
                size=28,
                color="white",
            ),
            'xref': 'paper',
            'yref': 'paper',
        },
        margin=dict(l=40, r=40, b=150, t=100),
        hovermode="closest",
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        dragmode="pan",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            bgcolor="rgba(30,30,60,0.5)",
            bordercolor="rgba(255,255,255,0.2)",
            borderwidth=1
        ),
    )

    # "Reset View" button that clears both axis ranges
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                showactive=False,
                buttons=[
                    dict(
                        label="Reset View",
                        method="relayout",
                        args=[{"xaxis.range": None, "yaxis.range": None}]
                    ),
                ],
                x=0.05,
                y=0.05,
                xanchor="left",
                yanchor="bottom",
                bgcolor="rgba(50,50,80,0.7)",
                bordercolor="rgba(255,255,255,0.2)",
            )
        ]
    )
    return fig


def _build_dashboard_layout(fig):
    """Return the page layout: header, graph card, paper panel and theme legend."""
    graph_box_style = {"height": "80vh", "minHeight": "800px"}
    legend_entries = [
        ("🔥", "Hot Topic: High citations & high influence"),
        ("💎", "Gap Opportunity: Low citations but high influence"),
        ("⚠️", "Risky Theme: High citations but low influence"),
        ("🔄", "Neutral: Average citations and influence"),
    ]

    header_row = dbc.Row([
        dbc.Col(html.H1(
            "Trend Analysis Dashboard ",
            style={
                "textAlign": "center",
                "color": "white",
                "marginBottom": "5px",
                "fontFamily": "Arial Black",
                "textShadow": "2px 2px 8px rgba(0,0,0,0.7)",
                "letterSpacing": "2px",
                "fontSize": "42px",
                "background": "linear-gradient(135deg, #790091 0%, #565cd5 100%)",
                "WebkitBackgroundClip": "text",
                "WebkitTextFillColor": "transparent",
                "paddingTop": "10px"
            }
        ), width=10),

        dbc.Col([
            html.Button(
                [
                    html.I(className="fas fa-download mr-2"),
                    " Save Dashboard"
                ],
                id="download-button",
                className="btn btn-outline-light",
                style={
                    "marginTop": "10px",
                    "backgroundColor": "rgba(80, 80, 150, 0.4)",
                    "border": "1px solid rgba(100, 100, 200, 0.5)",
                    "borderRadius": "8px",
                    "padding": "8px 15px",
                    "boxShadow": "0px 4px 8px rgba(0, 0, 0, 0.3)",
                    "transition": "all 0.3s ease",
                    "fontSize": "14px",
                    "fontWeight": "bold"
                }
            ),
            # Target of the download callback
            dcc.Download(id="download-dashboard")
        ], width=2),

        dbc.Col(html.P(
            "Interactive visualization of research topics and their relationships",
            style={
                "textAlign": "center",
                "color": "#aaddff",
                "marginBottom": "15px",
                "fontStyle": "italic",
                "fontSize": "16px",
                "fontWeight": "300",
                "letterSpacing": "0.5px",
                "textShadow": "1px 1px 3px rgba(0,0,0,0.5)",
            }
        ), width=12),
    ])

    graph_column = dbc.Col(
        dbc.Card(
            dbc.CardBody([
                dcc.Graph(
                    id="cluster-graph",
                    figure=fig,
                    config={
                        "scrollZoom": True,
                        "displayModeBar": True,
                        "modeBarButtonsToRemove": ["select2d", "lasso2d"]
                    },
                    style=dict(graph_box_style)
                )
            ], style=dict(graph_box_style)),
            style={
                "backgroundColor": "rgba(20, 20, 40, 0.7)",
                "borderRadius": "15px",
                "boxShadow": "0px 10px 30px rgba(0, 0, 0, 0.5)",
                "border": "1px solid rgba(100, 100, 200, 0.3)",
                "height": "80vh",
                "minHeight": "800px"  # Ensure minimum height
            }
        ),
        width=9
    )

    paper_column = dbc.Col(
        dbc.Card(
            dbc.CardBody([
                html.H3("Paper List", style={
                    "textAlign": "center",
                    "marginBottom": "15px",
                    "color": "#ffffff",
                    "fontFamily": "Arial",
                    "fontWeight": "bold",
                    "textShadow": "1px 1px 3px rgba(0,0,0,0.3)"
                }),
                html.Hr(style={"borderColor": "rgba(100, 100, 200, 0.3)", "margin": "10px 0 20px 0"}),
                html.Div(
                    id="paper-list",
                    style={
                        "overflowY": "auto",
                        "height": "700px",
                        "padding": "5px"
                    },
                    children=_paper_list_placeholder()
                ),
            ],
                style={
                    "backgroundColor": "rgba(30, 30, 50, 0.8)",
                    "borderRadius": "15px",
                    "padding": "20px",
                    "height": "100%"
                }),
            style={
                "height": "800px",
                "boxShadow": "0px 10px 30px rgba(0, 0, 0, 0.5)",
                "border": "1px solid rgba(100, 100, 200, 0.3)",
                "borderRadius": "15px"
            }
        ),
        width=3
    )

    legend_row = dbc.Row([
        dbc.Col(
            dbc.Card(
                dbc.CardBody([
                    html.H5("Theme Legend", style={"textAlign": "center", "marginBottom": "15px"}),
                    dbc.Row([
                        dbc.Col(html.Div([
                            html.Span(icon, style={"fontSize": "20px", "marginRight": "10px"}),
                            description
                        ]), width=3)
                        for icon, description in legend_entries
                    ])
                ]),
                style={
                    "backgroundColor": "rgba(30, 30, 50, 0.8)",
                    "borderRadius": "15px",
                    "marginTop": "20px",
                    "boxShadow": "0px 5px 15px rgba(0, 0, 0, 0.3)",
                    "border": "1px solid rgba(100, 100, 200, 0.3)"
                }
            ),
            width=12
        ),
    ])

    return dbc.Container(
        fluid=True,
        style={
            "backgroundColor": "#111122",
            "minHeight": "100vh",
            "height": "100%",
            "width": "100%",
            "backgroundImage": "linear-gradient(135deg, #111122 0%, #15162c 100%)",
            "padding": "20px"
        },
        children=[
            header_row,
            dbc.Row([graph_column, paper_column], style={"marginTop": "20px"}),
            legend_row,
            dcc.Store(id="stored-figure", data=fig)
        ]
    )


def build_dashboard(df, titleNm, topic_year):
    """Build the Dash app that visualises the clustered papers.

    Args:
        df: Papers with ``topic``, ``x``, ``y``, ``title``, ``url``,
            ``paperId``, ``citationCount`` and ``influentialCitationCount``
            columns (``topic_label`` is used when present). Not modified.
        titleNm: Topic name shown in the figure title.
        topic_year: Year appended to the title as ``"<topic>_<year>"``; may be
            ``None`` or empty, in which case only the topic is shown.

    Returns:
        The configured ``dash.Dash`` application (not yet running).
    """
    # The year is optional in the request model, so it must not be concatenated blindly.
    title_text = f"{titleNm}_{topic_year}" if topic_year else str(titleNm)

    color_palette = px.colors.qualitative.Vivid
    unique_topics = sorted(df["topic"].unique())
    color_map = {topic: color_palette[i % len(color_palette)] for i, topic in enumerate(unique_topics)}

    df, citation_75th, influence_75th = _prepare_cluster_metrics(df, color_map)

    # DARKLY gives the dark Bootstrap theme the custom styles build on
    app = dash.Dash(__name__, external_stylesheets=[dbc.themes.DARKLY])

    fig = _build_cluster_figure(df, unique_topics, color_map, title_text, citation_75th, influence_75th)
    app.layout = _build_dashboard_layout(fig)

    @app.callback(
        Output("download-dashboard", "data"),
        Input("download-button", "n_clicks"),
        State("cluster-graph", "figure"),
        prevent_initial_call=True
    )
    def download_dashboard(n_clicks, figure):
        if n_clicks is None:
            return None

        # Export the current figure as a standalone page; plotly.js is
        # referenced from the CDN rather than embedded.
        dashboard_html = pio.to_html(
            figure,
            full_html=True,
            include_plotlyjs='cdn',
            config={'responsive': True}
        )

        # Return the dashboard as an HTML file
        return dict(
            content=dashboard_html,
            filename="research_dashboard.html",
            type="text/html",
        )

    # Fill the paper panel with the papers of the clicked cluster
    @app.callback(
        Output("paper-list", "children"),
        [Input("cluster-graph", "clickData")]
    )
    def update_paper_list(clickData):
        if clickData is None:
            return _paper_list_placeholder()

        # Extract the clicked cluster ID; glow traces carry no customdata and
        # end up in the error branch.
        try:
            clicked_topic = clickData["points"][0]["customdata"][0]

            # Color of this topic, reused for styling consistency
            topic_color = color_map[clicked_topic]

            # Theme of this topic
            topic_theme = df[df["topic"] == clicked_topic]["theme"].iloc[0]

        except (KeyError, IndexError):
            return html.Div("Error retrieving cluster data.", style={"textAlign": "center", "marginTop": "20px"})

        # Papers in the clicked cluster, with titles and urls
        papers_in_cluster = df[df["topic"] == clicked_topic][["title", "url", "paperId"]]

        if papers_in_cluster.empty:
            return html.Div(f"No papers found for Cluster {clicked_topic}.",
                            style={"textAlign": "center", "marginTop": "20px"})

        # Topic label
        topic_label = df[df["topic"] == clicked_topic]['topic_label'].iloc[
            0] if 'topic_label' in df.columns else f"Cluster {clicked_topic}"

        # One clickable card per paper
        paper_list = []
        for position, (paper_title, paper_url) in enumerate(
                zip(papers_in_cluster["title"], papers_in_cluster["url"]), start=1):
            paper_list.append(
                dbc.Card(
                    dbc.CardBody([
                        html.A(
                            html.H6(
                                f"{position}. {paper_title}",
                                className="card-title",
                                style={
                                    "fontSize": "14px",
                                    "margin": "5px 0",
                                    "fontWeight": "normal",
                                    "lineHeight": "1.4",
                                    "color": "#aaccff",  # Blue color to indicate clickable link
                                    "cursor": "pointer"
                                }
                            ),
                            # A missing url (NaN/None) must not become a bogus link target.
                            href=paper_url if isinstance(paper_url, str) else None,
                            target="_blank",  # Open in new tab
                            style={"textDecoration": "none"}
                        ),
                    ], style={"padding": "12px"}),
                    # Hover effects come from the ``.paper-card:hover`` CSS rule
                    # in the page template; inline styles cannot express them.
                    style={
                        "marginBottom": "10px",
                        "backgroundColor": "rgba(40, 45, 60, 0.8)",
                        "borderRadius": "8px",
                        "borderLeft": f"4px solid {topic_color}",
                        "boxShadow": "0px 3px 8px rgba(0, 0, 0, 0.2)",
                        "transition": "transform 0.2s",
                    },
                    className="paper-card"
                )
            )

        return html.Div([
            html.Div([
                html.H4(
                    f"Cluster {clicked_topic}",
                    style={
                        "textAlign": "center",
                        "marginBottom": "5px",
                        "color": topic_color,
                        "fontWeight": "bold"
                    }
                ),
                html.H5(
                    topic_label,
                    style={
                        "textAlign": "center",
                        "marginBottom": "5px",
                        "color": "#aaaacc",
                        "fontStyle": "italic",
                        "fontWeight": "normal"
                    }
                ),
                html.Div(
                    topic_theme,
                    style={
                        "textAlign": "center",
                        "marginBottom": "15px",
                        "fontSize": "16px",
                        "fontWeight": "bold"
                    }
                ),
                html.Hr(style={"borderColor": "rgba(100, 100, 200, 0.3)", "margin": "10px 0 20px 0"}),
                html.H5(
                    f"Papers ({len(papers_in_cluster)})",
                    style={
                        "textAlign": "left",
                        "marginBottom": "15px",
                        "color": "#ffffff",
                        "fontWeight": "bold"
                    }
                ),
            ]),
            html.Div(
                paper_list,
                style={"paddingRight": "10px"},
            )
        ])

    # Custom page template with the hover and scrollbar CSS
    app.index_string = _DASH_INDEX_TEMPLATE
    return app
904
+
905
+
906
# Module-level state shared by shutdown_dash_app, run_dash and analyze_trends.
# Thread currently running the Dash server (None until a dashboard is started).
dash_thread = None
# Dash application built by run_dash (None when no dashboard is tracked).
dash_app = None
# TCP port the dashboard is served on and that the returned dashboard_url points to.
DASH_PORT = 7860
910
+
911
+
912
# Shut down the tracked Dash app: stop its server thread, then (if psutil is available) terminate processes still bound to the dashboard port.
913
def _terminate_port_holders(port):
    """Send SIGTERM to every *other* process with an inet socket bound to ``port``.

    Best effort: does nothing beyond a printed note when ``psutil`` is not
    installed, and skips processes that cannot be inspected or signalled.
    """
    import os
    import signal

    try:
        import psutil
    except ImportError:
        print("Could not find process using port")
        return

    own_pid = os.getpid()
    for proc in psutil.process_iter(['pid']):
        # The Dash server runs on a thread of this very process; signalling
        # our own pid would take the whole application down with it.
        if proc.pid == own_pid:
            continue
        try:
            for conn in proc.connections(kind='inet'):
                # Unbound sockets have an empty local address.
                if conn.laddr and conn.laddr.port == port:
                    print(f"Killing process {proc.pid} using port {port}")
                    os.kill(proc.pid, signal.SIGTERM)
                    break
        except (psutil.Error, OSError):
            # Process vanished or access was denied; move on to the next one.
            continue


def shutdown_dash_app():
    """Stop the Dash server started by an earlier request, if one is tracked.

    Raises ``SystemExit`` asynchronously inside the server thread, waits up to
    two seconds for it to finish, then terminates other processes still bound
    to ``DASH_PORT``. The module-level ``dash_app`` reference is cleared in
    every case.

    Returns:
        Always ``True``; failures are reported on stdout only.
    """
    global dash_thread, dash_app

    if dash_app is None:
        return True  # No app to shut down

    try:
        print("Shutting down previous Dash app...")

        # If we have a Dash app with a server, flag it as shutting down.
        # NOTE(review): nothing visible in this file reads ``_shutdown``; confirm it is still needed.
        if hasattr(dash_app, 'server'):
            dash_app._shutdown = True

        # Force the server thread to terminate
        if dash_thread and dash_thread.is_alive():
            import ctypes
            # The thread id parameter is an unsigned long.
            affected = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                ctypes.c_ulong(dash_thread.ident),
                ctypes.py_object(SystemExit)
            )
            if affected == 0:
                print("Dash thread not found; it may already have exited")
            dash_thread.join(timeout=2)

        # Free the port if some other process is still holding it
        _terminate_port_holders(DASH_PORT)

        print("Previous Dash app successfully shut down")
    except Exception as e:
        print(f"Error shutting down Dash app: {e}")
    finally:
        # Even if there were errors, reset the state
        dash_app = None

    return True
963
+
964
+
965
+ # Updated function to run Dash with error handling
966
def run_dash(df, titleNm, Topic_year):
    """Build the dashboard and serve it on ``DASH_PORT`` (blocks while serving).

    The built app is stored in the module-level ``dash_app``. Any exception
    raised while building or serving is printed and ``dash_app`` is reset to
    ``None``.
    """
    global dash_app

    try:
        # Build the dashboard
        dash_app = build_dashboard(df, titleNm, Topic_year)

        # Newer Dash releases replace ``run_server`` with ``run``; use ``run``
        # when present and fall back to ``run_server`` otherwise.
        serve = getattr(dash_app, "run", None) or dash_app.run_server
        serve(debug=False, port=DASH_PORT, use_reloader=False)
    except Exception as e:
        print(f"Error running Dash app: {e}")
        dash_app = None
978
+
979
+
980
# Endpoint: run trend analysis for one page of papers and (re)launch the dashboard.
981
@router.post("/analyze-trends/")
async def analyze_trends(request: Request, data_request: TrendAnalysisRequest):
    """Run trend analysis for one page of a user's papers and launch the dashboard.

    Shuts down any dashboard that is still tracked, loads the requested page of
    papers, clusters and labels them, then starts the Dash server and a
    browser-opening helper on daemon threads.

    Raises:
        HTTPException: 404 when the page is out of range or nothing matches the
            filter; 500 when no paper has a usable embedding.

    Returns:
        dict with a status message, pagination figures, cluster sizes, cluster
        labels and the dashboard URL.
    """
    global dash_thread
    title_name = data_request.topic
    topic_year = data_request.year

    # First, ensure any existing dashboard is properly shut down
    shutdown_dash_app()

    # Short delay to ensure the port is freed; awaited so that the event loop
    # keeps serving other requests in the meantime.
    await asyncio.sleep(1)

    # Fetch and process data
    df, current_page, total_pages, papers_count, total_papers = await fetch_papers_with_pagination(
        request, data_request.userId, data_request.topic, data_request.year, data_request.page
    )

    if df.empty and total_papers > 0:
        raise HTTPException(
            status_code=404,
            detail=f"No papers found for page {data_request.page + 1}. Valid pages are 1 to {total_pages}."
        )
    elif df.empty:
        raise HTTPException(
            status_code=404,
            detail=f"No papers found for userId '{data_request.userId}', topic '{data_request.topic}'" +
                   (f", and year '{data_request.year}'" if data_request.year else "")
        )

    # The analysis is long-running, CPU-bound work; run it on a worker thread
    # instead of blocking the event loop for its whole duration.
    loop = asyncio.get_running_loop()
    df, topic_labels = await loop.run_in_executor(None, perform_trend_analysis, df)

    if df.empty:
        raise HTTPException(status_code=500, detail="Failed to process embeddings for trend analysis")

    # Cluster statistics; ids and counts are coerced to built-in ints so the
    # response is JSON-serialisable whatever integer type the model produced.
    cluster_sizes = {int(topic_id): int(size) for topic_id, size in df.groupby("topic").size().items()}
    cluster_titles = {int(topic_id): label for topic_id, label in topic_labels.items()}

    # Create and start a new thread for the dashboard
    dash_thread = threading.Thread(target=run_dash, args=(df, title_name, topic_year))
    dash_thread.daemon = True
    dash_thread.start()

    # Open browser automatically
    browser_thread = threading.Thread(target=open_browser)
    browser_thread.daemon = True
    browser_thread.start()

    return {
        "message": f"Trend analysis completed for papers (page {current_page + 1} of {total_pages})",
        "current_page": current_page,
        "total_pages": total_pages,
        "papers_count": papers_count,
        "total_papers": total_papers,
        "cluster_sizes": cluster_sizes,
        "cluster_titles": cluster_titles,
        "dashboard_url": f"http://localhost:{DASH_PORT}"
    }
1039
+
1040
+
1041
+ # Function to open browser after a short delay
1042
def open_browser():
    """Pause briefly, then open the dashboard URL in a new browser window."""
    startup_delay_seconds = 2  # Wait for servers to start
    time.sleep(startup_delay_seconds)
    dashboard_url = f"http://localhost:{DASH_PORT}"
    webbrowser.open_new(dashboard_url)