YixuanWang committed on
Commit
c4ee5b7
verified
1 Parent(s): 583a378

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +232 -0
  2. requirements.txt +6 -0
  3. twitter_dataset.csv +0 -0
app.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
+ from textblob import TextBlob
7
+ from typing import List, Dict, Tuple
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ import logging
11
+
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
@dataclass
class RecommendationWeights:
    """User-chosen importance of each scoring signal.

    The three values are relative: they do not have to add up to one.
    """

    # Multiplies the ``Credibility`` column when scoring; the UI slider that
    # feeds this field is labelled "Credibility Weight".
    visibility: float
    # Multiplies the ``Sentiment`` column when scoring.
    sentiment: float
    # Multiplies the ``Popularity`` column when scoring.
    popularity: float
20
+
21
class TweetPreprocessor:
    """Load a tweet CSV and derive sentiment, popularity and credibility columns.

    The CSV must provide ``Text``, ``Retweets`` and ``Likes`` columns.
    Constructing an instance also loads the sequence-classification model
    named by ``model_name`` onto CUDA when available, otherwise the CPU.
    """

    def __init__(self, data_path: Path):
        """Read the dataset at *data_path* and load the classifier.

        Raises:
            ValueError: If a required column is missing from the CSV.
            Exception: Any other error raised while reading the CSV is
                logged and re-raised unchanged.
        """
        self.data = self._load_data(data_path)
        self.model_name = "hamzab/roberta-fake-news-classification"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model, self.tokenizer = self._load_model()

    def _load_model(self):
        """Return ``(model, tokenizer)`` for ``self.model_name`` on ``self.device``."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(self.device)
        return model, tokenizer

    @staticmethod
    def _load_data(data_path: Path) -> pd.DataFrame:
        """Read the CSV and verify that the required columns are present."""
        try:
            data = pd.read_csv(data_path)
            required_columns = {'Text', 'Retweets', 'Likes'}
            if not required_columns.issubset(data.columns):
                raise ValueError(f"Missing required columns: {required_columns - set(data.columns)}")
            return data
        except Exception as e:
            # Log for the operator, then let the caller decide what to do.
            logger.error(f"Error loading data: {e}")
            raise

    def calculate_metrics(self, batch_size: int = 100) -> pd.DataFrame:
        """Add ``Sentiment``, ``Popularity`` and ``Credibility`` columns to the data.

        Args:
            batch_size: Number of tweets sent through the classifier per
                forward pass.

        Returns:
            ``self.data`` (modified in place) with the three new columns.

        Raises:
            ValueError: If *batch_size* is smaller than one.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # read_csv yields NaN (a float) for empty cells; neither TextBlob nor
        # the tokenizer accepts that, so such rows are scored as empty text.
        texts = self.data['Text'].fillna('').astype(str)

        self.data['Sentiment'] = texts.apply(lambda text: TextBlob(text).sentiment.polarity)
        self.data['Popularity'] = self._normalized_popularity(
            self.data['Retweets'] + self.data['Likes']
        )

        predictions = self._predict_labels(texts.tolist(), batch_size)
        # Class index 1 is treated as credible, everything else as not.
        # NOTE(review): confirm index 1 is the "real" label in the model's id2label.
        self.data['Credibility'] = [1 if pred == 1 else -1 for pred in predictions]
        return self.data

    @staticmethod
    def _normalized_popularity(engagement: pd.Series) -> pd.Series:
        """Z-score *engagement*, then divide by the largest magnitude.

        Returns an all-zero series when the spread is undefined or zero
        instead of propagating NaN from a division by zero.
        """
        centered = engagement - engagement.mean()
        spread = engagement.std()
        # std() is NaN for a single row and 0 when every tweet has identical
        # engagement; dividing by it would turn the whole column into NaN.
        if pd.isna(spread) or spread == 0:
            return centered * 0.0
        z_scores = centered / spread
        return z_scores / z_scores.abs().max()

    def _predict_labels(self, texts: List[str], batch_size: int) -> List[int]:
        """Return the classifier's argmax class index for every entry of *texts*."""
        labels: List[int] = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
            inputs = {key: val.to(self.device) for key, val in inputs.items()}
            # Inference only: skip gradient bookkeeping.
            with torch.no_grad():
                outputs = self.model(**inputs)
            labels.extend(outputs.logits.argmax(dim=1).cpu().tolist())
        return labels
67
+
68
class RecommendationSystem:
    """Rank preprocessed tweets by a weighted blend of three signals.

    The signals are the ``Credibility``, ``Sentiment`` and ``Popularity``
    columns produced by ``TweetPreprocessor.calculate_metrics``.
    """

    # Recommendations are randomly sampled from this many top-scoring tweets
    # rather than taken from the head, so results vary between calls.
    CANDIDATE_POOL_SIZE = 100

    def __init__(self, data_path: Path):
        """Preprocess the dataset at *data_path* and keep the scored frame."""
        self.preprocessor = TweetPreprocessor(data_path)
        self.data = None
        self.setup_system()

    def setup_system(self):
        """Compute the per-tweet metrics and store them on ``self.data``."""
        self.data = self.preprocessor.calculate_metrics()

    def get_recommendations(self, weights: "RecommendationWeights", num_recommendations: int = 10) -> Dict:
        """Return up to *num_recommendations* tweets sampled from the top scorers.

        Args:
            weights: Non-negative relative weights; they are rescaled to sum
                to one before scoring.
            num_recommendations: Maximum number of tweets to return. Fewer
                are returned when the dataset is smaller than this.

        Returns:
            ``{"recommendations": [...], "score_explanation": {...}}`` on
            success, or ``{"error": "Invalid weights provided"}`` when any
            weight is negative.
        """
        if not self._validate_weights(weights):
            return {"error": "Invalid weights provided"}

        normalized = self._normalize_weights(weights)
        final_score = (
            self.data['Credibility'] * normalized.visibility
            + self.data['Sentiment'] * normalized.sentiment
            + self.data['Popularity'] * normalized.popularity
        )
        # assign() scores a copy, so requests running concurrently with
        # different weights cannot overwrite each other's Final_Score.
        scored = self.data.assign(Final_Score=final_score)

        candidates = scored.nlargest(self.CANDIDATE_POOL_SIZE, 'Final_Score')
        # sample() raises when asked for more rows than exist; cap the request.
        sample_size = min(num_recommendations, len(candidates))
        return self._format_recommendations(candidates.sample(sample_size))

    def _format_recommendations(self, recommendations: pd.DataFrame) -> Dict:
        """Convert scored rows into display-ready dictionaries."""
        formatted_results = [
            {
                "text": row['Text'],
                "scores": {
                    "score": f"{row['Final_Score']:.2f}",
                    "credibility": "Reliable" if row['Credibility'] > 0 else "Uncertain",
                    "sentiment": self._get_sentiment_label(row['Sentiment']),
                    "popularity": f"{row['Popularity']:.2f}",
                    # \u00b7 is the middle dot; written as an escape so the
                    # separator survives files saved in a non-UTF-8 encoding.
                    "engagement": f"Likes {row['Likes']} \u00b7 Retweets {row['Retweets']}",
                },
            }
            for row in recommendations.to_dict('records')
        ]
        return {
            "recommendations": formatted_results,
            "score_explanation": self._get_score_explanation(),
        }

    @staticmethod
    def _get_sentiment_label(sentiment_score: float) -> str:
        """Map a polarity score to "Positive", "Negative" or "Neutral"."""
        if sentiment_score > 0.3:
            return "Positive"
        if sentiment_score < -0.3:
            return "Negative"
        return "Neutral"

    @staticmethod
    def _validate_weights(weights: "RecommendationWeights") -> bool:
        """Return True when every weight field is greater than or equal to zero."""
        return all(getattr(weights, field) >= 0 for field in weights.__dataclass_fields__)

    @staticmethod
    def _normalize_weights(weights: "RecommendationWeights") -> "RecommendationWeights":
        """Rescale the weights to sum to one; all-zero input becomes equal thirds."""
        # Rebuild with the caller's own class so a subclass instance is
        # returned as the same type.
        weights_cls = type(weights)
        total = weights.visibility + weights.sentiment + weights.popularity
        if total == 0:
            return weights_cls(1/3, 1/3, 1/3)
        return weights_cls(
            visibility=weights.visibility / total,
            sentiment=weights.sentiment / total,
            popularity=weights.popularity / total,
        )

    @staticmethod
    def _get_score_explanation() -> Dict[str, str]:
        """Return a short human-readable description of each score."""
        return {
            "Credibility": "Content reliability assessment",
            "Sentiment": "Text emotional analysis result",
            "Popularity": "Score based on likes and retweets",
        }
147
+
148
def create_gradio_interface(recommendation_system: RecommendationSystem) -> gr.Blocks:
    """Build the Gradio UI: three weight sliders, a button and an HTML result pane.

    Args:
        recommendation_system: Object whose ``get_recommendations`` is called
            with the slider values each time the button is clicked.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    # Imported locally under a distinct name; the tweet text comes from the
    # dataset and must be escaped before being placed in markup.
    from html import escape

    score_guide = '''
        <div style="margin-bottom: 20px; padding: 15px; background-color: #1a1a1a; color: white; border-radius: 8px;">
            <h3 style="margin-top: 0;">Score Guide</h3>
            <ul style="margin: 0;">
                <li><strong>Credibility</strong>: Assessment of content reliability</li>
                <li><strong>Sentiment</strong>: Text emotional analysis (Positive/Negative/Neutral)</li>
                <li><strong>Popularity</strong>: Normalized score based on likes and retweets</li>
            </ul>
        </div>
    '''
    # (badge colour, visible label, key in the per-tweet "scores" dict)
    badge_specs = (
        ("#1976d2", "Score", "score"),
        ("#2e7d32", "Credibility", "credibility"),
        ("#ed6c02", "Sentiment", "sentiment"),
        ("#d32f2f", "Popularity", "popularity"),
        ("#7b1fa2", "Engagement", "engagement"),
    )

    with gr.Blocks(theme=gr.themes.Soft()) as interface:
        # Built from unindented pieces so the heading is never parsed as an
        # indented code block.
        gr.Markdown(
            "# Tweet Recommendation System\n"
            "Adjust weights to get personalized recommendations\n"
            "\n"
            "Note: To protect user privacy, some tweet content has been redacted or anonymized.\n"
        )

        with gr.Row():
            with gr.Column(scale=1):
                visibility_weight = gr.Slider(0, 1, 0.5, label="Credibility Weight", info="Adjust importance of content credibility")
                sentiment_weight = gr.Slider(0, 1, 0.3, label="Sentiment Weight", info="Adjust importance of emotional tone")
                popularity_weight = gr.Slider(0, 1, 0.2, label="Popularity Weight", info="Adjust importance of engagement metrics")
                submit_btn = gr.Button("Get Recommendations", variant="primary")

            with gr.Column(scale=2):
                output_html = gr.HTML()

        def format_recommendations(raw_recommendations):
            """Render the result of ``get_recommendations`` as an HTML fragment."""
            # get_recommendations returns {"error": ...} for invalid weights;
            # show that message instead of failing on the missing key.
            if "error" in raw_recommendations:
                message = escape(str(raw_recommendations["error"]))
                return f'<div style="font-family: sans-serif; color: #d32f2f;">{message}</div>'

            parts = ['<div style="font-family: sans-serif;">', score_guide]
            for rec in raw_recommendations["recommendations"]:
                scores = rec["scores"]
                badges = "".join(
                    f'<span style="padding: 3px 8px; background-color: {color}; '
                    f'color: white; border-radius: 4px;">'
                    f'{label}: {escape(str(scores[key]))}</span>'
                    for color, label, key in badge_specs
                )
                # str() first: a missing text cell is a float NaN, not a string.
                tweet_text = escape(str(rec["text"]))
                parts.append(
                    '<div style="margin-bottom: 15px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;">'
                    f'<div style="margin-bottom: 10px; font-size: 1.1em;">{tweet_text}</div>'
                    f'<div style="display: flex; flex-wrap: wrap; gap: 10px; font-size: 0.9em;">{badges}</div>'
                    '</div>'
                )
            parts.append('</div>')
            return "".join(parts)

        def get_recommendations_with_weights(v, s, p):
            """Click handler: slider values in, rendered HTML out."""
            weights = RecommendationWeights(v, s, p)
            return format_recommendations(recommendation_system.get_recommendations(weights))

        submit_btn.click(
            fn=get_recommendations_with_weights,
            inputs=[visibility_weight, sentiment_weight, popularity_weight],
            outputs=output_html,
        )

    return interface
219
+
220
def main():
    """Build the recommendation system from the bundled CSV and serve the UI.

    Any failure during start-up is logged and then re-raised.
    """
    dataset_path = Path('twitter_dataset.csv')
    try:
        system = RecommendationSystem(data_path=dataset_path)
        app = create_gradio_interface(system)
        app.launch()
    except Exception as e:
        logger.error(f"Application failed to start: {e}")
        raise


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ gradio
4
+ pandas
5
+ numpy
6
+ textblob
twitter_dataset.csv ADDED
The diff for this file is too large to render. See raw diff