Sihanas committed on
Commit
190e77d
·
verified ·
1 Parent(s): 3f5d031

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -30
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import streamlit as st
2
- import requests
3
- from bs4 import BeautifulSoup
4
  import torch
5
  from transformers import T5ForConditionalGeneration, T5Tokenizer
6
- import os
7
 
8
  # Initialize session state for model and tokenizer
9
  if 'model' not in st.session_state:
@@ -71,41 +70,32 @@ def summarize_text(text, model, tokenizer, device):
71
  return None
72
 
73
  def fetch_article(url):
74
- """Fetch article content and metadata from URL"""
75
  try:
76
- headers = {
77
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
78
- }
79
- response = requests.get(url, headers=headers, timeout=10)
80
- response.raise_for_status() # Raise an exception for bad status codes
81
 
82
- soup = BeautifulSoup(response.content, 'html.parser')
 
 
83
 
84
  # Extract metadata
85
- title = soup.find('meta', property='og:title') or soup.title
86
- title = title.get('content', '').strip() if title else 'No title found'
87
-
88
- authors = soup.find('meta', {'name': 'author'})
89
- authors = authors.get('content', '').strip() if authors else 'No author information'
90
-
91
- publish_date = soup.find('meta', {'property': 'article:published_time'})
92
- publish_date = publish_date.get('content', '').strip() if publish_date else 'No publish date found'
93
-
94
- publisher = soup.find('meta', {'property': 'og:site_name'})
95
- publisher = publisher.get('content', '').strip() if publisher else 'No publisher information'
96
 
97
- # Remove scripts, styles, and navigation elements
98
- for element in soup(['script', 'style', 'nav', 'header', 'footer']):
99
- element.decompose()
100
-
101
- text = soup.get_text(separator=' ', strip=True)
102
 
103
- return title, authors, publish_date, publisher, text
 
 
 
104
 
105
- except requests.exceptions.RequestException as e:
106
  st.error(f"Error fetching the article: {str(e)}")
107
  return None, None, None, None, None
108
-
109
  def main():
110
  st.title("News Article Summarizer")
111
  st.write("Enter a news article URL to get a summary.")
@@ -151,4 +141,4 @@ def main():
151
  st.error("Failed to fetch the article. Please check the URL and try again.")
152
 
153
  if __name__ == "__main__":
154
- main()
 
1
  import streamlit as st
2
+ import newspaper
 
3
  import torch
4
  from transformers import T5ForConditionalGeneration, T5Tokenizer
5
+ from urllib.parse import urlparse
6
 
7
  # Initialize session state for model and tokenizer
8
  if 'model' not in st.session_state:
 
70
  return None
71
 
72
  def fetch_article(url):
73
+ """Fetch article content and metadata from URL using newspaper3k"""
74
  try:
75
+ # Download and parse the article
76
+ article = newspaper.Article(url)
 
 
 
77
 
78
+ # Enable extraction of all possible metadata
79
+ article.download()
80
+ article.parse()
81
 
82
  # Extract metadata
83
+ title = article.title or 'No title found'
84
+ authors = ', '.join(article.authors) if article.authors else 'No author information'
85
+ publish_date = article.publish_date or 'No publish date found'
 
 
 
 
 
 
 
 
86
 
87
+ # Extract publisher from URL domain
88
+ publisher = urlparse(url).netloc.replace('www.', '').capitalize() or 'No publisher information'
 
 
 
89
 
90
+ # Get the main text content
91
+ text = article.text or ''
92
+
93
+ return title, authors, str(publish_date), publisher, text
94
 
95
+ except Exception as e:
96
  st.error(f"Error fetching the article: {str(e)}")
97
  return None, None, None, None, None
98
+
99
  def main():
100
  st.title("News Article Summarizer")
101
  st.write("Enter a news article URL to get a summary.")
 
141
  st.error("Failed to fetch the article. Please check the URL and try again.")
142
 
143
  if __name__ == "__main__":
144
+ main()