Sihanas committed
Commit 0d925f9 · verified · Parent: df953e9

Upload 2 files

Files changed (2)
  1. app.py +154 -0
  2. requirements.txt +58 -0
app.py ADDED
@@ -0,0 +1,154 @@
+ import streamlit as st
+ import requests
+ from bs4 import BeautifulSoup
+ import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ import os
+
+ # Initialize session state for model and tokenizer
+ if 'model' not in st.session_state:
+     st.session_state.model = None
+ if 'tokenizer' not in st.session_state:
+     st.session_state.tokenizer = None
+
+ @st.cache_resource
+ def load_model():
+     try:
+         # Check if CUDA is available
+         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+         # Load the model
+         model = T5ForConditionalGeneration.from_pretrained('t5-base')
+
+         # Load the saved weights with appropriate map_location
+         checkpoint = torch.load('abstractive-model-sihanas.pth', map_location=device)
+
+         model.load_state_dict(checkpoint)
+         model.to(device)
+
+         # Load tokenizer
+         tokenizer = T5Tokenizer.from_pretrained('t5-base')
+
+         return model, tokenizer, device
+
+     except Exception as e:
+         st.error(f"Error loading model: {str(e)}")
+         return None, None, None
+
+ def clean_text(text):
+     """Clean and preprocess the input text"""
+     # Remove extra whitespace
+     text = ' '.join(text.split())
+     # Remove very long words (likely garbage)
+     text = ' '.join(word for word in text.split() if len(word) < 100)
+     return text
+
+ def summarize_text(text, model, tokenizer, device):
+     try:
+         # Clean the text
+         cleaned_text = clean_text(text)
+
+         # Tokenize and generate summary
+         inputs = tokenizer.encode("summarize: " + cleaned_text,
+                                   return_tensors='pt',
+                                   max_length=512,
+                                   truncation=True).to(device)
+
+         summary_ids = model.generate(
+             inputs,
+             max_length=150,
+             min_length=40,
+             num_beams=4,
+             length_penalty=2.0,
+             early_stopping=True
+         )
+
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
+
+     except Exception as e:
+         st.error(f"Error in summarization: {str(e)}")
+         return None
+
+ def fetch_article(url):
+     """Fetch article content and metadata from URL"""
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         }
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()  # Raise an exception for bad status codes
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Extract metadata
+         title = soup.find('meta', property='og:title') or soup.title
+         title = title.get('content', '').strip() if title else 'No title found'
+
+         authors = soup.find('meta', {'name': 'author'})
+         authors = authors.get('content', '').strip() if authors else 'No author information'
+
+         publish_date = soup.find('meta', {'property': 'article:published_time'})
+         publish_date = publish_date.get('content', '').strip() if publish_date else 'No publish date found'
+
+         publisher = soup.find('meta', {'property': 'og:site_name'})
+         publisher = publisher.get('content', '').strip() if publisher else 'No publisher information'
+
+         # Remove scripts, styles, and navigation elements
+         for element in soup(['script', 'style', 'nav', 'header', 'footer']):
+             element.decompose()
+
+         text = soup.get_text(separator=' ', strip=True)
+
+         return title, authors, publish_date, publisher, text
+
+     except requests.exceptions.RequestException as e:
+         st.error(f"Error fetching the article: {str(e)}")
+         return None, None, None, None, None
+
+ def main():
+     st.title("News Article Summarizer")
+     st.write("Enter a news article URL to get a summary.")
+
+     # Load model and tokenizer
+     model, tokenizer, device = load_model()
+
+     if model is None or tokenizer is None:
+         st.error("Failed to load the model. Please check your model file and dependencies.")
+         return
+
+     # URL input
+     url = st.text_input("News Article URL")
+
+     if st.button("Summarize"):
+         if not url:
+             st.warning("Please enter a URL")
+             return
+
+         with st.spinner("Fetching article and generating summary..."):
+             # Fetch article
+             title, authors, publish_date, publisher, article_text = fetch_article(url)
+
+             if article_text:
+                 # Display metadata
+                 st.write(f"**Title**: {title}")
+                 st.write(f"**Authors**: {authors}")
+                 st.write(f"**Publish Date**: {publish_date}")
+                 st.write(f"**Publisher**: {publisher}")
+
+                 # Generate summary
+                 summary = summarize_text(article_text, model, tokenizer, device)
+
+                 if summary:
+                     st.success("Summary generated successfully!")
+                     st.write("### Summary")
+                     st.write(summary)
+
+                     # Display original text (collapsed)
+                     with st.expander("Show original article"):
+                         st.write(article_text)
+             else:
+                 st.error("Failed to fetch the article. Please check the URL and try again.")
+
+ if __name__ == "__main__":
+     main()
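
Note on the checkpoint file: load_model() passes the object returned by torch.load('abstractive-model-sihanas.pth') straight to model.load_state_dict(), so the file is expected to contain a bare state_dict for a fine-tuned t5-base. The training script is not part of this commit; the snippet below is only a minimal sketch, under that assumption, of how a compatible checkpoint could be produced and saved.

# Hypothetical sketch (not included in this commit): saving a checkpoint
# in the format load_model() expects.
import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained('t5-base')
# ... fine-tune `model` on a summarization dataset here (assumed step) ...

# Save only the state_dict; load_model() feeds the loaded object directly
# to model.load_state_dict(), so a bare state_dict is what it needs.
torch.save(model.state_dict(), 'abstractive-model-sihanas.pth')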
requirements.txt ADDED
@@ -0,0 +1,58 @@
+ altair==5.5.0
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ blinker==1.9.0
+ bs4==0.0.2
+ cachetools==5.5.0
+ certifi==2024.8.30
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ filelock==3.16.1
+ fsspec==2024.10.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ huggingface-hub==0.26.5
+ idna==3.10
+ jinja2==3.1.4
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ narwhals==1.16.0
+ networkx==3.2.1
+ numpy==2.0.2
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.0.0
+ protobuf==5.29.1
+ pyarrow==18.1.0
+ pydeck==0.9.1
+ pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ pytz==2024.2
+ PyYAML==6.0.2
+ referencing==0.35.1
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ rpds-py==0.22.3
+ safetensors==0.4.5
+ six==1.17.0
+ smmap==5.0.1
+ soupsieve==2.6
+ streamlit==1.40.2
+ sympy==1.13.1
+ tenacity==9.0.0
+ tokenizers==0.21.0
+ toml==0.10.2
+ torch==2.5.1
+ tornado==6.4.2
+ tqdm==4.67.1
+ transformers==4.47.0
+ typing-extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ watchdog==6.0.0