loayshabet committed
Commit 9e406c0 · verified · 1 Parent(s): 602dc07

Update app.py

Files changed (1):
  1. app.py +174 -95
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline, AutoModelForSeq2SeqGeneration, AutoTokenizer
+from transformers import pipeline
 import feedparser
 from datetime import datetime, timedelta
 import json
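Worth noting for reviewers: the removed import names AutoModelForSeq2SeqGeneration, which has never been a transformers class (the auto class for seq2seq generation is AutoModelForSeq2SeqLM), so the old translation setup failed before it could load a model. This commit removes translation outright; a minimal sketch of what a fix would have looked like instead, using one of the MarianMT checkpoints from the old LANGUAGE_CODES table:

# Not part of this commit: the class name the old code actually needed.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

inputs = tokenizer("Breaking news from the tech world", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))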
@@ -20,119 +20,211 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 
-# Language codes and their corresponding MarianMT model names
+# Language codes for supported languages
 LANGUAGE_CODES = {
-    "English": {"code": "en", "model": None},  # No translation needed for English
-    "Spanish": {"code": "es", "model": "Helsinki-NLP/opus-mt-en-es"},
-    "French": {"code": "fr", "model": "Helsinki-NLP/opus-mt-en-fr"},
-    "German": {"code": "de", "model": "Helsinki-NLP/opus-mt-en-de"},
-    "Italian": {"code": "it", "model": "Helsinki-NLP/opus-mt-en-it"},
-    "Portuguese": {"code": "pt", "model": "Helsinki-NLP/opus-mt-en-pt"},
-    "Dutch": {"code": "nl", "model": "Helsinki-NLP/opus-mt-en-nl"},
-    "Russian": {"code": "ru", "model": "Helsinki-NLP/opus-mt-en-ru"},
-    "Chinese": {"code": "zh", "model": "Helsinki-NLP/opus-mt-en-zh"},
-    "Japanese": {"code": "ja", "model": "Helsinki-NLP/opus-mt-en-jap"},
-    "Arabic": {"code": "ar", "model": "Helsinki-NLP/opus-mt-en-ar"}
+    "English": "en",
+    "Spanish": "es",
+    "French": "fr",
+    "German": "de",
+    "Italian": "it",
+    "Portuguese": "pt",
+    "Dutch": "nl",
+    "Russian": "ru",
+    "Chinese": "zh",
+    "Japanese": "ja",
+    "Arabic": "ar"  # Added Arabic support
 }
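The map is now a plain display-name to ISO 639-1 lookup. None of the hunks below consume it anymore, so it presumably still feeds the language preference UI on unchanged lines; a hedged sketch of that kind of use (the Dropdown component itself is not shown in this diff):

# Illustrative only; not taken from app.py's unchanged lines.
import gradio as gr

language_dropdown = gr.Dropdown(
    choices=list(LANGUAGE_CODES.keys()),  # display names as choices
    value="English",
    label="Preferred language",
)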
 
-# [Previous NEWS_SOURCES definition remains the same...]
+# News sources organized by category
+NEWS_SOURCES = {
+    "Technology": [
+        "https://feeds.feedburner.com/TechCrunch/",
+        "https://www.theverge.com/rss/index.xml",
+        "https://www.wired.com/feed/rss",
+        "https://feeds.feedburner.com/TheNextWeb"  # Added for more variety
+    ],
+    "Business": [
+        "https://feeds.feedburner.com/forbes/business",
+        "https://www.ft.com/rss/home",
+        "https://feeds.bloomberg.com/markets/news.rss",
+        "https://www.aljazeera.com/xml/rss/all.xml"  # Added Arabic business news
+    ],
+    "Science": [
+        "https://rss.sciencedaily.com/all.xml",
+        "https://www.nature.com/nature.rss",
+        "https://science.nasa.gov/rss.xml"
+    ],
+    "Health": [
+        "https://rss.medicalnewstoday.com/newsfeeds/medical_all.xml",
+        "https://www.who.int/rss-feeds/news-english.xml",
+        "https://www.healthline.com/rss/news"
+    ],
+    "World News": [
+        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
+        "https://feeds.bbci.co.uk/news/world/rss.xml",
+        "https://www.reuters.com/rssFeed/world",
+        "https://arabic.cnn.com/rss"  # Added Arabic news source
+    ]
+}
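Several of these endpoints (the FeedBurner and Reuters paths in particular) are of uncertain longevity; a quick one-off reachability check, run manually and separate from app.py, could look like this:

import feedparser

# Sanity-check the feed table above; purely a reviewer's scratch script.
for category, urls in NEWS_SOURCES.items():
    for url in urls:
        feed = feedparser.parse(url)
        state = "ok" if feed.entries else "empty or unreachable"
        print(f"[{category}] {url}: {state}")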
 
 # Initialize global variables
 summarizer = None
 
 class NewsCache:
     def __init__(self):
         self.summaries = {}
-        self.translations = {}
         self.max_cache_size = 1000
 
-    def store_summary(self, content_hash, summary, language=None):
-        cache_key = f"{content_hash}_{language}" if language else content_hash
-
+    def store_summary(self, content_hash, summary):
         if len(self.summaries) >= self.max_cache_size:
             # Remove oldest entry if cache is full
             self.summaries.pop(next(iter(self.summaries)))
-
-        self.summaries[cache_key] = summary
+        self.summaries[content_hash] = summary
 
-    def get_summary(self, content_hash, language=None):
-        cache_key = f"{content_hash}_{language}" if language else content_hash
-        return self.summaries.get(cache_key)
+    def get_summary(self, content_hash):
+        return self.summaries.get(content_hash)
 
 news_cache = NewsCache()
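Eviction note: popping next(iter(self.summaries)) removes the oldest inserted key, so this is a FIFO cache riding on Python 3.7+ dict insertion order, not a true LRU. If recency-based eviction were wanted, a sketch along these lines would do it (LRUNewsCache is illustrative, not part of the app):

from collections import OrderedDict

class LRUNewsCache:
    def __init__(self, max_size=1000):
        self.summaries = OrderedDict()
        self.max_size = max_size

    def store_summary(self, content_hash, summary):
        self.summaries[content_hash] = summary
        self.summaries.move_to_end(content_hash)
        if len(self.summaries) > self.max_size:
            self.summaries.popitem(last=False)  # evict least recently used

    def get_summary(self, content_hash):
        if content_hash in self.summaries:
            self.summaries.move_to_end(content_hash)  # mark as recently used
        return self.summaries.get(content_hash)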
 
-def initialize_models():
-    """Initialize the summarization and translation models"""
-    global summarizer, translators
-
+def get_content_hash(content):
+    """Generate hash for content to use as cache key"""
+    return hashlib.md5(content.encode()).hexdigest()
+
+def clean_text(text):
+    """Clean and normalize text content"""
+    if not text:
+        return ""
+    # Remove HTML tags and normalize whitespace
+    text = BeautifulSoup(text, "html.parser").get_text()
+    return " ".join(text.split())
+
+@lru_cache(maxsize=100)
+def fetch_feed_with_timeout(url):
+    """Fetch RSS feed with timeout and caching"""
+    try:
+        response = requests.get(url, timeout=10)
+        return feedparser.parse(response.content)
+    except Exception as e:
+        logging.error(f"Error fetching feed {url}: {e}")
+        return None
+
+def initialize_summarizer():
+    """Initialize the summarization pipeline"""
+    global summarizer
     try:
-        # Initialize summarizer
         summarizer = pipeline(
             "summarization",
             model="facebook/bart-large-cnn",
             device=-1  # Use CPU
         )
-
-        # Initialize translators for each language
-        for lang, info in LANGUAGE_CODES.items():
-            if info["model"]:  # Skip English as it doesn't need translation
-                try:
-                    model = AutoModelForSeq2SeqGeneration.from_pretrained(info["model"])
-                    tokenizer = AutoTokenizer.from_pretrained(info["model"])
-                    translators[lang] = (model, tokenizer)
-                    logging.info(f"Initialized translator for {lang}")
-                except Exception as e:
-                    logging.error(f"Error initializing translator for {lang}: {e}")
-
         return True
     except Exception as e:
-        logging.error(f"Error initializing models: {e}")
+        logging.error(f"Error initializing summarizer: {e}")
         return False
 
93
- def translate_text(text, target_language):
94
- """Translate text to target language"""
95
- if target_language == "English" or not text:
96
- return text
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  try:
99
- if target_language not in translators:
100
- logging.error(f"Translator not found for {target_language}")
101
- return text
102
 
103
- model, tokenizer = translators[target_language]
104
-
105
- # Split text into chunks to handle long text
106
- max_length = 512
107
- chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
108
- translated_chunks = []
109
-
110
- for chunk in chunks:
111
- inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
112
- translated = model.generate(**inputs)
113
- translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
114
- translated_chunks.append(translated_text)
115
 
116
- return " ".join(translated_chunks)
117
-
 
118
  except Exception as e:
119
- logging.error(f"Translation error: {e}")
120
- return text
121
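Behavior of the new date helpers, assuming parse_date and is_recent_article are in scope: parsedate_to_datetime handles the RFC 2822 dates common in RSS, the fromisoformat branch catches Atom-style ISO 8601 timestamps, and anything else falls through to None, which is_recent_article treats as not recent.

print(parse_date("Mon, 06 Jan 2025 16:30:00 GMT"))  # RFC 2822 -> tz-aware datetime
print(parse_date("2025-01-06T16:30:00Z"))           # ISO 8601 via the fromisoformat branch
print(parse_date("yesterday"))                      # unparseable -> None
print(is_recent_article("2025-01-06T16:30:00Z"))    # True only inside the 8-hour window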
 
-def generate_summary(text, title="", category="", language="English"):
-    """Generate summary with translation support"""
+def fetch_news_from_rss(interests):
+    """Fetch recent news from RSS feeds"""
+    articles = []
+    max_articles_per_category = 2
+
+    with ThreadPoolExecutor(max_workers=3) as executor:
+        for interest in interests:
+            if interest not in NEWS_SOURCES:
+                continue
+
+            future_to_url = {
+                executor.submit(fetch_feed_with_timeout, url): url
+                for url in NEWS_SOURCES[interest]
+            }
+
+            category_count = 0
+            for future in future_to_url:
+                if category_count >= max_articles_per_category:
+                    break
+
+                try:
+                    feed = future.result(timeout=15)
+                    if not feed:
+                        continue
+
+                    for entry in feed.entries:
+                        published_date = entry.get('published', '') or entry.get('updated', '')
+
+                        if not is_recent_article(published_date):
+                            continue
+
+                        description = entry.get('description', '') or entry.get('summary', '')
+                        description = clean_text(description)
+
+                        if len(description) < 50:
+                            continue
+
+                        article = {
+                            'title': clean_text(entry.get('title', 'Untitled')),
+                            'description': description,
+                            'category': interest,
+                            'link': entry.get('link', ''),
+                            'published': published_date
+                        }
+                        articles.append(article)
+                        category_count += 1
+
+                        if category_count >= max_articles_per_category:
+                            break
+
+                except (TimeoutError, Exception) as e:
+                    logging.error(f"Error processing feed: {e}")
+                    continue
+
+    return articles
+
+def generate_summary(text, title="", category=""):
+    """Generate summary with enhanced prompting"""
     if not summarizer:
-        if not initialize_models():
+        if not initialize_summarizer():
             return None
 
     try:
         # Check cache first
         content_hash = get_content_hash(text)
-        cached_summary = news_cache.get_summary(content_hash, language)
+        cached_summary = news_cache.get_summary(content_hash)
         if cached_summary:
             return cached_summary
 
-        # Generate English summary first
+        # Enhanced prompt template for better summaries
         prompt_template = f"""
 Analyze and summarize this {category} news article titled "{title}".
 Focus on providing:
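Two small observations on the new fetch loop: iterating future_to_url yields futures in submission order, so one slow feed stalls faster ones queued behind it (concurrent.futures.as_completed would yield whichever finishes first), and except (TimeoutError, Exception) is redundant, since Exception already covers TimeoutError. A completion-order sketch reusing the names from this diff; fetch_feeds_completion_order is an illustrative name only:

from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import TimeoutError as FuturesTimeoutError

def fetch_feeds_completion_order(urls):
    feeds = []
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_url = {executor.submit(fetch_feed_with_timeout, u): u for u in urls}
        try:
            for future in as_completed(future_to_url, timeout=15):
                feed = future.result()
                if feed:
                    feeds.append(feed)
        except FuturesTimeoutError:
            pass  # stragglers past 15 s are dropped
    return feeds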
@@ -147,6 +239,7 @@ Article text:
 
 Please provide a clear, concise summary that a general audience can understand:"""
 
+        # Prepare input text
         prompted_text = prompt_template.format(text=text[:1024])
 
         result = summarizer(prompted_text,
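Note that text[:1024] trims characters, not tokens; bart-large-cnn's encoder limit is 1024 tokens, so long inputs still depend on the pipeline's internal truncation. A token-aware trim, sketched with the model's own tokenizer and this hunk's prompted_text variable:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
ids = tok(prompted_text, truncation=True, max_length=1024)["input_ids"]
prompted_text = tok.decode(ids, skip_special_tokens=True)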
@@ -158,16 +251,12 @@ Please provide a clear, concise summary that a general audience can understand:"
         if result and len(result) > 0:
             summary = result[0]['summary_text']
 
-            # Post-process summary
+            # Post-process summary for better readability
             summary = summary.replace(" .", ".").replace(" ,", ",")
             sentences = summary.split(". ")
             formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
 
-            # Translate if needed
-            if language != "English":
-                formatted_summary = translate_text(formatted_summary, language)
-
-            news_cache.store_summary(content_hash, formatted_summary, language)
+            news_cache.store_summary(content_hash, formatted_summary)
             return formatted_summary
 
         return None
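The generation arguments of the summarizer call sit on unchanged lines this diff does not show. For facebook/bart-large-cnn a typical call is shaped like the following; the specific values here are assumptions, not what app.py uses:

result = summarizer(
    prompted_text,
    max_length=150,   # assumed output ceiling, in tokens
    min_length=50,    # assumed output floor
    do_sample=False,  # deterministic decoding
)
summary = result[0]["summary_text"]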
@@ -177,7 +266,7 @@ Please provide a clear, concise summary that a general audience can understand:"
         return None
 
 def get_personalized_summary(name, progress=gr.Progress()):
-    """Generate personalized news summary in user's preferred language"""
+    """Generate personalized news summary"""
     start_time = time.time()
     logging.info(f"Starting summary generation for user: {name}")
 
@@ -192,21 +281,19 @@ def get_personalized_summary(name, progress=gr.Progress()):
     except Exception as e:
         return f"Error loading preferences: {e}"
 
-    user_language = preferences.get("language", "English")
-
     # Fetch articles with progress
     progress(0.2, desc="Fetching recent news...")
     articles = fetch_news_from_rss(preferences["interests"])
 
     if not articles:
-        return translate_text("No recent news articles found from the last 8 hours. Please try again later.", user_language)
+        return "No recent news articles found from the last 8 hours. Please try again later."
 
     # Process articles with timeout
     progress(0.4, desc="Analyzing and summarizing...")
     summaries = []
     total_articles = len(articles)
 
-    max_processing_time = 60
+    max_processing_time = 60  # Maximum processing time in seconds
 
     for i, article in enumerate(articles):
         if time.time() - start_time > max_processing_time:
@@ -226,24 +313,18 @@ def get_personalized_summary(name, progress=gr.Progress()):
             if not content:
                 continue
 
-            summary = generate_summary(content, title, category, user_language)
+            summary = generate_summary(content, title, category)
             if not summary:
                 continue
 
-            # Translate title and category if needed
-            if user_language != "English":
-                title = translate_text(title, user_language)
-                category = translate_text(category, user_language)
-                published_str = translate_text(published_str, user_language)
-
             formatted_summary = f"""
 📰 {title}
-📁 {translate_text("Category", user_language)}: {category}
-⏰ {translate_text("Published", user_language)}: {published_str}
+📁 Category: {category}
+⏰ Published: {published_str}
 
 {summary}
 
-🔗 {translate_text("Read more", user_language)}: {link}
+🔗 Read more: {link}
 
 ---"""
             summaries.append(formatted_summary)
@@ -253,13 +334,11 @@ def get_personalized_summary(name, progress=gr.Progress()):
             continue
 
     if not summaries:
-        return translate_text("Unable to generate summaries for recent news. Please try again.", user_language)
+        return "Unable to generate summaries for recent news. Please try again."
 
     progress(1.0, desc="Done!")
     return "\n".join(summaries)
 
-# [Rest of the code remains the same...]
-
 # Gradio interface
 with gr.Blocks(title="Enhanced News Summarizer") as demo:
     gr.Markdown("# 📰 Enhanced AI News Summarizer")
@@ -319,7 +398,7 @@ with gr.Blocks(title="Enhanced News Summarizer") as demo:
     )
 
 if __name__ == "__main__":
-    if initialize_models():
+    if initialize_summarizer():
         demo.launch()
     else:
         print("Failed to initialize summarizer. Please check the logs.")
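The helpers added in this commit lean on names that never appear in the changed import lines, namely hashlib, BeautifulSoup, requests, lru_cache, parsedate_to_datetime, pytz, and ThreadPoolExecutor, so they presumably come from the untouched import block at lines 6-19 of app.py. For reference, the imports the new code requires:

# Presumed to already exist on app.py's unchanged lines 6-19 (not shown in this diff).
import hashlib
import logging
import time
import requests
import pytz
from bs4 import BeautifulSoup
from functools import lru_cache
from email.utils import parsedate_to_datetime
from concurrent.futures import ThreadPoolExecutor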