loayshabet committed
Commit 9e406c0 · verified · 1 Parent(s): 602dc07

Update app.py

Files changed (1):
  1. app.py +174 -95
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline, AutoModelForSeq2SeqGeneration, AutoTokenizer
+from transformers import pipeline
 import feedparser
 from datetime import datetime, timedelta
 import json
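Worth noting for reviewers: the removed import names AutoModelForSeq2SeqGeneration, which has never been a transformers class (the auto class for seq2seq generation is AutoModelForSeq2SeqLM), so the old translation setup failed before it could load a model. This commit removes translation outright; a minimal sketch of what a fix would have looked like instead, using one of the MarianMT checkpoints from the old LANGUAGE_CODES table:

# Not part of this commit: the class name the old code actually needed.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

inputs = tokenizer("Breaking news from the tech world", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))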
@@ -20,119 +20,211 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 
-# Language codes and their corresponding MarianMT model names
+# Language codes for supported languages
 LANGUAGE_CODES = {
-    "English": {"code": "en", "model": None},  # No translation needed for English
-    "Spanish": {"code": "es", "model": "Helsinki-NLP/opus-mt-en-es"},
-    "French": {"code": "fr", "model": "Helsinki-NLP/opus-mt-en-fr"},
-    "German": {"code": "de", "model": "Helsinki-NLP/opus-mt-en-de"},
-    "Italian": {"code": "it", "model": "Helsinki-NLP/opus-mt-en-it"},
-    "Portuguese": {"code": "pt", "model": "Helsinki-NLP/opus-mt-en-pt"},
-    "Dutch": {"code": "nl", "model": "Helsinki-NLP/opus-mt-en-nl"},
-    "Russian": {"code": "ru", "model": "Helsinki-NLP/opus-mt-en-ru"},
-    "Chinese": {"code": "zh", "model": "Helsinki-NLP/opus-mt-en-zh"},
-    "Japanese": {"code": "ja", "model": "Helsinki-NLP/opus-mt-en-jap"},
-    "Arabic": {"code": "ar", "model": "Helsinki-NLP/opus-mt-en-ar"}
+    "English": "en",
+    "Spanish": "es",
+    "French": "fr",
+    "German": "de",
+    "Italian": "it",
+    "Portuguese": "pt",
+    "Dutch": "nl",
+    "Russian": "ru",
+    "Chinese": "zh",
+    "Japanese": "ja",
+    "Arabic": "ar"  # Added Arabic support
 }
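The map is now a plain display-name to ISO 639-1 lookup. None of the hunks below consume it anymore, so it presumably still feeds the language preference UI on unchanged lines; a hedged sketch of that kind of use (the Dropdown component itself is not shown in this diff):

# Illustrative only; not taken from app.py's unchanged lines.
import gradio as gr

language_dropdown = gr.Dropdown(
    choices=list(LANGUAGE_CODES.keys()),  # display names as choices
    value="English",
    label="Preferred language",
)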
 
-# [Previous NEWS_SOURCES definition remains the same...]
+# News sources organized by category
+NEWS_SOURCES = {
+    "Technology": [
+        "https://feeds.feedburner.com/TechCrunch/",
+        "https://www.theverge.com/rss/index.xml",
+        "https://www.wired.com/feed/rss",
+        "https://feeds.feedburner.com/TheNextWeb"  # Added for more variety
+    ],
+    "Business": [
+        "https://feeds.feedburner.com/forbes/business",
+        "https://www.ft.com/rss/home",
+        "https://feeds.bloomberg.com/markets/news.rss",
+        "https://www.aljazeera.com/xml/rss/all.xml"  # Added Arabic business news
+    ],
+    "Science": [
+        "https://rss.sciencedaily.com/all.xml",
+        "https://www.nature.com/nature.rss",
+        "https://science.nasa.gov/rss.xml"
+    ],
+    "Health": [
+        "https://rss.medicalnewstoday.com/newsfeeds/medical_all.xml",
+        "https://www.who.int/rss-feeds/news-english.xml",
+        "https://www.healthline.com/rss/news"
+    ],
+    "World News": [
+        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
+        "https://feeds.bbci.co.uk/news/world/rss.xml",
+        "https://www.reuters.com/rssFeed/world",
+        "https://arabic.cnn.com/rss"  # Added Arabic news source
+    ]
+}
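Several of these endpoints (the FeedBurner and Reuters paths in particular) are of uncertain longevity; a quick one-off reachability check, run manually and separate from app.py, could look like this:

import feedparser

# Sanity-check the feed table above; purely a reviewer's scratch script.
for category, urls in NEWS_SOURCES.items():
    for url in urls:
        feed = feedparser.parse(url)
        state = "ok" if feed.entries else "empty or unreachable"
        print(f"[{category}] {url}: {state}")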
 
 # Initialize global variables
 summarizer = None
 
 class NewsCache:
     def __init__(self):
         self.summaries = {}
-        self.translations = {}
         self.max_cache_size = 1000
 
-    def store_summary(self, content_hash, summary, language=None):
-        cache_key = f"{content_hash}_{language}" if language else content_hash
-
+    def store_summary(self, content_hash, summary):
         if len(self.summaries) >= self.max_cache_size:
             # Remove oldest entry if cache is full
             self.summaries.pop(next(iter(self.summaries)))
-
-        self.summaries[cache_key] = summary
+        self.summaries[content_hash] = summary
 
-    def get_summary(self, content_hash, language=None):
-        cache_key = f"{content_hash}_{language}" if language else content_hash
-        return self.summaries.get(cache_key)
+    def get_summary(self, content_hash):
+        return self.summaries.get(content_hash)
 
 news_cache = NewsCache()
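Eviction note: popping next(iter(self.summaries)) removes the oldest inserted key, so this is a FIFO cache riding on Python 3.7+ dict insertion order, not a true LRU. If recency-based eviction were wanted, a sketch along these lines would do it (LRUNewsCache is illustrative, not part of the app):

from collections import OrderedDict

class LRUNewsCache:
    def __init__(self, max_size=1000):
        self.summaries = OrderedDict()
        self.max_size = max_size

    def store_summary(self, content_hash, summary):
        self.summaries[content_hash] = summary
        self.summaries.move_to_end(content_hash)
        if len(self.summaries) > self.max_size:
            self.summaries.popitem(last=False)  # evict least recently used

    def get_summary(self, content_hash):
        if content_hash in self.summaries:
            self.summaries.move_to_end(content_hash)  # mark as recently used
        return self.summaries.get(content_hash)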
 
-def initialize_models():
-    """Initialize the summarization and translation models"""
-    global summarizer, translators
-
+def get_content_hash(content):
+    """Generate hash for content to use as cache key"""
+    return hashlib.md5(content.encode()).hexdigest()
+
+def clean_text(text):
+    """Clean and normalize text content"""
+    if not text:
+        return ""
+    # Remove HTML tags and normalize whitespace
+    text = BeautifulSoup(text, "html.parser").get_text()
+    return " ".join(text.split())
+
+@lru_cache(maxsize=100)
+def fetch_feed_with_timeout(url):
+    """Fetch RSS feed with timeout and caching"""
+    try:
+        response = requests.get(url, timeout=10)
+        return feedparser.parse(response.content)
+    except Exception as e:
+        logging.error(f"Error fetching feed {url}: {e}")
+        return None
+
+def initialize_summarizer():
+    """Initialize the summarization pipeline"""
+    global summarizer
     try:
-        # Initialize summarizer
         summarizer = pipeline(
             "summarization",
             model="facebook/bart-large-cnn",
             device=-1  # Use CPU
         )
-
-        # Initialize translators for each language
-        for lang, info in LANGUAGE_CODES.items():
-            if info["model"]:  # Skip English as it doesn't need translation
-                try:
-                    model = AutoModelForSeq2SeqGeneration.from_pretrained(info["model"])
-                    tokenizer = AutoTokenizer.from_pretrained(info["model"])
-                    translators[lang] = (model, tokenizer)
-                    logging.info(f"Initialized translator for {lang}")
-                except Exception as e:
-                    logging.error(f"Error initializing translator for {lang}: {e}")
-
         return True
     except Exception as e:
-        logging.error(f"Error initializing models: {e}")
+        logging.error(f"Error initializing summarizer: {e}")
         return False
 
93
- def translate_text(text, target_language):
94
- """Translate text to target language"""
95
- if target_language == "English" or not text:
96
- return text
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  try:
99
- if target_language not in translators:
100
- logging.error(f"Translator not found for {target_language}")
101
- return text
102
 
103
- model, tokenizer = translators[target_language]
104
-
105
- # Split text into chunks to handle long text
106
- max_length = 512
107
- chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
108
- translated_chunks = []
109
-
110
- for chunk in chunks:
111
- inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
112
- translated = model.generate(**inputs)
113
- translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
114
- translated_chunks.append(translated_text)
115
 
116
- return " ".join(translated_chunks)
117
-
 
118
  except Exception as e:
119
- logging.error(f"Translation error: {e}")
120
- return text
121
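Behavior of the new date helpers, assuming parse_date and is_recent_article are in scope: parsedate_to_datetime handles the RFC 2822 dates common in RSS, the fromisoformat branch catches Atom-style ISO 8601 timestamps, and anything else falls through to None, which is_recent_article treats as not recent.

print(parse_date("Mon, 06 Jan 2025 16:30:00 GMT"))  # RFC 2822 -> tz-aware datetime
print(parse_date("2025-01-06T16:30:00Z"))           # ISO 8601 via the fromisoformat branch
print(parse_date("yesterday"))                      # unparseable -> None
print(is_recent_article("2025-01-06T16:30:00Z"))    # True only inside the 8-hour window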
 
-def generate_summary(text, title="", category="", language="English"):
-    """Generate summary with translation support"""
+def fetch_news_from_rss(interests):
+    """Fetch recent news from RSS feeds"""
+    articles = []
+    max_articles_per_category = 2
+
+    with ThreadPoolExecutor(max_workers=3) as executor:
+        for interest in interests:
+            if interest not in NEWS_SOURCES:
+                continue
+
+            future_to_url = {
+                executor.submit(fetch_feed_with_timeout, url): url
+                for url in NEWS_SOURCES[interest]
+            }
+
+            category_count = 0
+            for future in future_to_url:
+                if category_count >= max_articles_per_category:
+                    break
+
+                try:
+                    feed = future.result(timeout=15)
+                    if not feed:
+                        continue
+
+                    for entry in feed.entries:
+                        published_date = entry.get('published', '') or entry.get('updated', '')
+
+                        if not is_recent_article(published_date):
+                            continue
+
+                        description = entry.get('description', '') or entry.get('summary', '')
+                        description = clean_text(description)
+
+                        if len(description) < 50:
+                            continue
+
+                        article = {
+                            'title': clean_text(entry.get('title', 'Untitled')),
+                            'description': description,
+                            'category': interest,
+                            'link': entry.get('link', ''),
+                            'published': published_date
+                        }
+                        articles.append(article)
+                        category_count += 1
+
+                        if category_count >= max_articles_per_category:
+                            break
+
+                except (TimeoutError, Exception) as e:
+                    logging.error(f"Error processing feed: {e}")
+                    continue
+
+    return articles
+
+def generate_summary(text, title="", category=""):
+    """Generate summary with enhanced prompting"""
     if not summarizer:
-        if not initialize_models():
+        if not initialize_summarizer():
             return None
 
     try:
         # Check cache first
         content_hash = get_content_hash(text)
-        cached_summary = news_cache.get_summary(content_hash, language)
+        cached_summary = news_cache.get_summary(content_hash)
         if cached_summary:
             return cached_summary
 
-        # Generate English summary first
+        # Enhanced prompt template for better summaries
         prompt_template = f"""
 Analyze and summarize this {category} news article titled "{title}".
 Focus on providing:
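Two small observations on the new fetch loop: iterating future_to_url yields futures in submission order, so one slow feed stalls faster ones queued behind it (concurrent.futures.as_completed would yield whichever finishes first), and except (TimeoutError, Exception) is redundant, since Exception already covers TimeoutError. A completion-order sketch reusing the names from this diff; fetch_feeds_completion_order is an illustrative name only:

from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import TimeoutError as FuturesTimeoutError

def fetch_feeds_completion_order(urls):
    feeds = []
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_url = {executor.submit(fetch_feed_with_timeout, u): u for u in urls}
        try:
            for future in as_completed(future_to_url, timeout=15):
                feed = future.result()
                if feed:
                    feeds.append(feed)
        except FuturesTimeoutError:
            pass  # stragglers past 15 s are dropped
    return feeds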
@@ -147,6 +239,7 @@ Article text:
 
 Please provide a clear, concise summary that a general audience can understand:"""
 
+        # Prepare input text
         prompted_text = prompt_template.format(text=text[:1024])
 
         result = summarizer(prompted_text,
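Note that text[:1024] trims characters, not tokens; bart-large-cnn's encoder limit is 1024 tokens, so long inputs still depend on the pipeline's internal truncation. A token-aware trim, sketched with the model's own tokenizer and this hunk's prompted_text variable:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
ids = tok(prompted_text, truncation=True, max_length=1024)["input_ids"]
prompted_text = tok.decode(ids, skip_special_tokens=True)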
@@ -158,16 +251,12 @@ Please provide a clear, concise summary that a general audience can understand:"
         if result and len(result) > 0:
             summary = result[0]['summary_text']
 
-            # Post-process summary
+            # Post-process summary for better readability
             summary = summary.replace(" .", ".").replace(" ,", ",")
             sentences = summary.split(". ")
             formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
 
-            # Translate if needed
-            if language != "English":
-                formatted_summary = translate_text(formatted_summary, language)
-
-            news_cache.store_summary(content_hash, formatted_summary, language)
+            news_cache.store_summary(content_hash, formatted_summary)
             return formatted_summary
 
         return None
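The generation arguments of the summarizer call sit on unchanged lines this diff does not show. For facebook/bart-large-cnn a typical call is shaped like the following; the specific values here are assumptions, not what app.py uses:

result = summarizer(
    prompted_text,
    max_length=150,   # assumed output ceiling, in tokens
    min_length=50,    # assumed output floor
    do_sample=False,  # deterministic decoding
)
summary = result[0]["summary_text"]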
@@ -177,7 +266,7 @@ Please provide a clear, concise summary that a general audience can understand:"
         return None
 
 def get_personalized_summary(name, progress=gr.Progress()):
-    """Generate personalized news summary in user's preferred language"""
+    """Generate personalized news summary"""
     start_time = time.time()
     logging.info(f"Starting summary generation for user: {name}")
 
@@ -192,21 +281,19 @@ def get_personalized_summary(name, progress=gr.Progress()):
     except Exception as e:
         return f"Error loading preferences: {e}"
 
-    user_language = preferences.get("language", "English")
-
     # Fetch articles with progress
     progress(0.2, desc="Fetching recent news...")
     articles = fetch_news_from_rss(preferences["interests"])
 
     if not articles:
-        return translate_text("No recent news articles found from the last 8 hours. Please try again later.", user_language)
+        return "No recent news articles found from the last 8 hours. Please try again later."
 
     # Process articles with timeout
     progress(0.4, desc="Analyzing and summarizing...")
     summaries = []
     total_articles = len(articles)
 
-    max_processing_time = 60
+    max_processing_time = 60  # Maximum processing time in seconds
 
     for i, article in enumerate(articles):
         if time.time() - start_time > max_processing_time:
@@ -226,24 +313,18 @@ def get_personalized_summary(name, progress=gr.Progress()):
             if not content:
                 continue
 
-            summary = generate_summary(content, title, category, user_language)
+            summary = generate_summary(content, title, category)
             if not summary:
                 continue
 
-            # Translate title and category if needed
-            if user_language != "English":
-                title = translate_text(title, user_language)
-                category = translate_text(category, user_language)
-                published_str = translate_text(published_str, user_language)
-
             formatted_summary = f"""
 📰 {title}
-📁 {translate_text("Category", user_language)}: {category}
-⏰ {translate_text("Published", user_language)}: {published_str}
+📁 Category: {category}
+⏰ Published: {published_str}
 
 {summary}
 
-🔗 {translate_text("Read more", user_language)}: {link}
+🔗 Read more: {link}
 
 ---"""
             summaries.append(formatted_summary)
@@ -253,13 +334,11 @@ def get_personalized_summary(name, progress=gr.Progress()):
             continue
 
     if not summaries:
-        return translate_text("Unable to generate summaries for recent news. Please try again.", user_language)
+        return "Unable to generate summaries for recent news. Please try again."
 
     progress(1.0, desc="Done!")
     return "\n".join(summaries)
 
-# [Rest of the code remains the same...]
-
 # Gradio interface
 with gr.Blocks(title="Enhanced News Summarizer") as demo:
     gr.Markdown("# 📰 Enhanced AI News Summarizer")
@@ -319,7 +398,7 @@ with gr.Blocks(title="Enhanced News Summarizer") as demo:
     )
 
 if __name__ == "__main__":
-    if initialize_models():
+    if initialize_summarizer():
         demo.launch()
     else:
         print("Failed to initialize summarizer. Please check the logs.")
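The helpers added in this commit lean on names that never appear in the changed import lines, namely hashlib, BeautifulSoup, requests, lru_cache, parsedate_to_datetime, pytz, and ThreadPoolExecutor, so they presumably come from the untouched import block at lines 6-19 of app.py. For reference, the imports the new code requires:

# Presumed to already exist on app.py's unchanged lines 6-19 (not shown in this diff).
import hashlib
import logging
import time
import requests
import pytz
from bs4 import BeautifulSoup
from functools import lru_cache
from email.utils import parsedate_to_datetime
from concurrent.futures import ThreadPoolExecutor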