Spaces:

Sihanas
/

text-summarizer-for-news-articles

Running

App Files Community

Sihanas committed on Dec 13, 2024

Commit

190e77d

verified ·

1 Parent(s): 3f5d031

Upload app.py

Browse files

Files changed (1) hide show

app.py +20 -30

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import streamlit as st
-import requests
-from bs4 import BeautifulSoup
 import torch
 from transformers import T5ForConditionalGeneration, T5Tokenizer
-import os
 # Initialize session state for model and tokenizer
 if 'model' not in st.session_state:
@@ -71,41 +70,32 @@ def summarize_text(text, model, tokenizer, device):
         return None
 def fetch_article(url):
-    """Fetch article content and metadata from URL"""
     try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()  # Raise an exception for bad status codes
-        soup = BeautifulSoup(response.content, 'html.parser')
         # Extract metadata
-        title = soup.find('meta', property='og:title') or soup.title
-        title = title.get('content', '').strip() if title else 'No title found'
-        authors = soup.find('meta', {'name': 'author'})
-        authors = authors.get('content', '').strip() if authors else 'No author information'
-        publish_date = soup.find('meta', {'property': 'article:published_time'})
-        publish_date = publish_date.get('content', '').strip() if publish_date else 'No publish date found'
-        publisher = soup.find('meta', {'property': 'og:site_name'})
-        publisher = publisher.get('content', '').strip() if publisher else 'No publisher information'
-        # Remove scripts, styles, and navigation elements
-        for element in soup(['script', 'style', 'nav', 'header', 'footer']):
-            element.decompose()
-        text = soup.get_text(separator=' ', strip=True)
-        return title, authors, publish_date, publisher, text
-    except requests.exceptions.RequestException as e:
         st.error(f"Error fetching the article: {str(e)}")
         return None, None, None, None, None
 def main():
     st.title("News Article Summarizer")
     st.write("Enter a news article URL to get a summary.")
@@ -151,4 +141,4 @@ def main():
                 st.error("Failed to fetch the article. Please check the URL and try again.")
 if __name__ == "__main__":
-    main()

 import streamlit as st
+import newspaper
 import torch
 from transformers import T5ForConditionalGeneration, T5Tokenizer
+from urllib.parse import urlparse
 # Initialize session state for model and tokenizer
 if 'model' not in st.session_state:
         return None
 def fetch_article(url):
+    """Fetch article content and metadata from URL using newspaper3k"""
     try:
+        # Download and parse the article
+        article = newspaper.Article(url)
+        # Enable extraction of all possible metadata
+        article.download()
+        article.parse()
         # Extract metadata
+        title = article.title or 'No title found'
+        authors = ', '.join(article.authors) if article.authors else 'No author information'
+        publish_date = article.publish_date or 'No publish date found'
+        # Extract publisher from URL domain
+        publisher = urlparse(url).netloc.replace('www.', '').capitalize() or 'No publisher information'
+        # Get the main text content
+        text = article.text or ''
+        return title, authors, str(publish_date), publisher, text
+    except Exception as e:
         st.error(f"Error fetching the article: {str(e)}")
         return None, None, None, None, None
 def main():
     st.title("News Article Summarizer")
     st.write("Enter a news article URL to get a summary.")
                 st.error("Failed to fetch the article. Please check the URL and try again.")
 if __name__ == "__main__":
+    main()