Sihanas committed
Commit 0d925f9 · verified · Parent: df953e9

Upload 2 files

Files changed (2)
  1. app.py +154 -0
  2. requirements.txt +58 -0
app.py ADDED
@@ -0,0 +1,154 @@
+ import streamlit as st
+ import requests
+ from bs4 import BeautifulSoup
+ import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ import os
+
+ # Initialize session state for model and tokenizer
+ if 'model' not in st.session_state:
+     st.session_state.model = None
+ if 'tokenizer' not in st.session_state:
+     st.session_state.tokenizer = None
+
+ @st.cache_resource
+ def load_model():
+     try:
+         # Check if CUDA is available
+         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+         # Load the model
+         model = T5ForConditionalGeneration.from_pretrained('t5-base')
+
+         # Load the saved weights with appropriate map_location
+         checkpoint = torch.load('abstractive-model-sihanas.pth', map_location=device)
+
+         model.load_state_dict(checkpoint)
+         model.to(device)
+
+         # Load tokenizer
+         tokenizer = T5Tokenizer.from_pretrained('t5-base')
+
+         return model, tokenizer, device
+
+     except Exception as e:
+         st.error(f"Error loading model: {str(e)}")
+         return None, None, None
+
+ def clean_text(text):
+     """Clean and preprocess the input text"""
+     # Remove extra whitespace
+     text = ' '.join(text.split())
+     # Remove very long words (likely garbage)
+     text = ' '.join(word for word in text.split() if len(word) < 100)
+     return text
+
+ def summarize_text(text, model, tokenizer, device):
+     try:
+         # Clean the text
+         cleaned_text = clean_text(text)
+
+         # Tokenize and generate summary
+         inputs = tokenizer.encode("summarize: " + cleaned_text,
+                                   return_tensors='pt',
+                                   max_length=512,
+                                   truncation=True).to(device)
+
+         summary_ids = model.generate(
+             inputs,
+             max_length=150,
+             min_length=40,
+             num_beams=4,
+             length_penalty=2.0,
+             early_stopping=True
+         )
+
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
+
+     except Exception as e:
+         st.error(f"Error in summarization: {str(e)}")
+         return None
+
+ def fetch_article(url):
+     """Fetch article content and metadata from URL"""
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         }
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()  # Raise an exception for bad status codes
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Extract metadata
+         title = soup.find('meta', property='og:title') or soup.title
+         title = title.get('content', '').strip() if title else 'No title found'
+
+         authors = soup.find('meta', {'name': 'author'})
+         authors = authors.get('content', '').strip() if authors else 'No author information'
+
+         publish_date = soup.find('meta', {'property': 'article:published_time'})
+         publish_date = publish_date.get('content', '').strip() if publish_date else 'No publish date found'
+
+         publisher = soup.find('meta', {'property': 'og:site_name'})
+         publisher = publisher.get('content', '').strip() if publisher else 'No publisher information'
+
+         # Remove scripts, styles, and navigation elements
+         for element in soup(['script', 'style', 'nav', 'header', 'footer']):
+             element.decompose()
+
+         text = soup.get_text(separator=' ', strip=True)
+
+         return title, authors, publish_date, publisher, text
+
+     except requests.exceptions.RequestException as e:
+         st.error(f"Error fetching the article: {str(e)}")
+         return None, None, None, None, None
+
+ def main():
+     st.title("News Article Summarizer")
+     st.write("Enter a news article URL to get a summary.")
+
+     # Load model and tokenizer
+     model, tokenizer, device = load_model()
+
+     if model is None or tokenizer is None:
+         st.error("Failed to load the model. Please check your model file and dependencies.")
+         return
+
+     # URL input
+     url = st.text_input("News Article URL")
+
+     if st.button("Summarize"):
+         if not url:
+             st.warning("Please enter a URL")
+             return
+
+         with st.spinner("Fetching article and generating summary..."):
+             # Fetch article
+             title, authors, publish_date, publisher, article_text = fetch_article(url)
+
+             if article_text:
+                 # Display metadata
+                 st.write(f"**Title**: {title}")
+                 st.write(f"**Authors**: {authors}")
+                 st.write(f"**Publish Date**: {publish_date}")
+                 st.write(f"**Publisher**: {publisher}")
+
+                 # Generate summary
+                 summary = summarize_text(article_text, model, tokenizer, device)
+
+                 if summary:
+                     st.success("Summary generated successfully!")
+                     st.write("### Summary")
+                     st.write(summary)
+
+                     # Display original text (collapsed)
+                     with st.expander("Show original article"):
+                         st.write(article_text)
+             else:
+                 st.error("Failed to fetch the article. Please check the URL and try again.")
+
+ if __name__ == "__main__":
+     main()
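
Note on the checkpoint file: load_model() passes the object returned by torch.load('abstractive-model-sihanas.pth') straight to model.load_state_dict(), so the file is expected to contain a bare state_dict for a fine-tuned t5-base. The training script is not part of this commit; the snippet below is only a minimal sketch, under that assumption, of how a compatible checkpoint could be produced and saved.

# Hypothetical sketch (not included in this commit): saving a checkpoint
# in the format load_model() expects.
import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained('t5-base')
# ... fine-tune `model` on a summarization dataset here (assumed step) ...

# Save only the state_dict; load_model() feeds the loaded object directly
# to model.load_state_dict(), so a bare state_dict is what it needs.
torch.save(model.state_dict(), 'abstractive-model-sihanas.pth')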
requirements.txt ADDED
@@ -0,0 +1,58 @@
+ altair==5.5.0
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ blinker==1.9.0
+ bs4==0.0.2
+ cachetools==5.5.0
+ certifi==2024.8.30
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ filelock==3.16.1
+ fsspec==2024.10.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ huggingface-hub==0.26.5
+ idna==3.10
+ jinja2==3.1.4
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ narwhals==1.16.0
+ networkx==3.2.1
+ numpy==2.0.2
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.0.0
+ protobuf==5.29.1
+ pyarrow==18.1.0
+ pydeck==0.9.1
+ pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ pytz==2024.2
+ PyYAML==6.0.2
+ referencing==0.35.1
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ rpds-py==0.22.3
+ safetensors==0.4.5
+ six==1.17.0
+ smmap==5.0.1
+ soupsieve==2.6
+ streamlit==1.40.2
+ sympy==1.13.1
+ tenacity==9.0.0
+ tokenizers==0.21.0
+ toml==0.10.2
+ torch==2.5.1
+ tornado==6.4.2
+ tqdm==4.67.1
+ transformers==4.47.0
+ typing-extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ watchdog==6.0.0