tyriaa committed on
Commit
26848d5
·
1 Parent(s): f4e203d

Update the application with the AI news dashboard

Files changed (4)
  1. README.md +17 -4
  2. requirements.txt +7 -3
  3. src/fetch_data.py +162 -0
  4. src/streamlit_app.py +134 -34
README.md CHANGED
@@ -11,9 +11,22 @@ pinned: false
  short_description: 'Regroup multiple RSS flux on AI '
  ---
 
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
+ # AI News Dashboard
+
+ This Streamlit web application aggregates and displays the latest artificial intelligence news from multiple RSS feeds. It lets users filter the news by date and source and add their own custom RSS feeds.
+
+ ## Features
+
+ - Aggregation of RSS feeds from major AI news sources
+ - Filtering by date and source
+ - Support for custom RSS feeds
+ - Data caching for optimal performance
+ - Intuitive user interface
+
+ ## Technologies used
+
+ - Streamlit
+ - Pandas
+ - Feedparser
+ - BeautifulSoup
+ - Concurrent Futures (for parallel processing)
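To make the feature list above concrete, here is a minimal sketch of the aggregation idea, assuming only `feedparser` and `pandas` from the requirements below; the feed URL is one of the sources added in `src/fetch_data.py`, and everything else is illustrative rather than part of the commit.

```python
# Minimal sketch: parse one RSS feed and collect its entries into a DataFrame.
import feedparser
import pandas as pd

feed = feedparser.parse("https://openai.com/news/rss.xml")  # example source from the app's feed list
rows = [
    {
        "Title": entry.get("title", "No Title"),
        "Link": entry.get("link", "No Link"),
        "Published": entry.get("published", "No Date"),
        "Source": "OpenAI News",
    }
    for entry in feed.entries
]
print(pd.DataFrame(rows).head())
```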
requirements.txt CHANGED
@@ -1,3 +1,7 @@
- altair
- pandas
- streamlit
+ streamlit==1.32.0
+ pandas==2.1.0
+ feedparser==6.0.10
+ bs4==0.0.1
+ beautifulsoup4==4.12.2
+ concurrent-log-handler==0.9.24
+ altair
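For reference, `bs4` is a thin wrapper that installs `beautifulsoup4`, and `concurrent.futures` is part of the Python standard library. A quick, hedged way to check that the pinned packages resolve after installation:

```python
# Sanity check: the imports used by src/fetch_data.py and src/streamlit_app.py.
import concurrent.futures  # standard library, no extra package required

import feedparser
import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup

print(feedparser.__version__, pd.__version__, st.__version__)
```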
src/fetch_data.py ADDED
@@ -0,0 +1,162 @@
+ import feedparser
+ import pandas as pd
+ from datetime import datetime, timedelta
+ import ssl
+ from bs4 import BeautifulSoup
+ import concurrent.futures
+ import re
+ 
+ # Disable SSL certificate verification so feeds with misconfigured certificates can still be fetched
+ if hasattr(ssl, '_create_unverified_context'):
+     ssl._create_default_https_context = ssl._create_unverified_context
+ 
+ def fetch_single_feed(link_source_tuple):
+     """Fetch a single RSS feed and return its entries."""
+     link, source = link_source_tuple
+     entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": []}
+ 
+     try:
+         feed = feedparser.parse(link)
+ 
+         for entry in feed.entries:
+             entries["Title"].append(entry.get("title", "No Title"))
+             entries["Link"].append(entry.get("link", "No Link"))
+             entries["Published"].append(entry.get("published", "No Date"))
+             entries["Description"].append(entry.get("description", "No Description"))
+             entries["Source"].append(source)
+ 
+     except Exception as e:
+         print(f"Error fetching {link}: {e}")
+ 
+     return entries
+ 
+ def fetch_feed(links):
+     """Fetch multiple RSS feeds in parallel."""
+     all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": []}
+ 
+     # Use ThreadPoolExecutor to fetch feeds in parallel
+     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+         future_to_link = {executor.submit(fetch_single_feed, (link, source)): (link, source)
+                           for link, source in links.items()}
+ 
+         for future in concurrent.futures.as_completed(future_to_link):
+             link, source = future_to_link[future]
+             try:
+                 result = future.result()
+                 # Merge results into all_entries
+                 for key in all_entries:
+                     all_entries[key].extend(result[key])
+             except Exception as e:
+                 print(f"Exception for {link}: {e}")
+ 
+     # Create a DataFrame from all entries
+     df = pd.DataFrame(all_entries)
+     return df
+ 
+ def clean_html(text):
+     """Clean HTML tags from text."""
+     try:
+         soup = BeautifulSoup(text, "html.parser")
+         return soup.get_text()
+     except Exception as e:
+         print(f"Error cleaning HTML: {e}")
+         return text
+ 
+ def extract_date(date_str):
+     """Extract a date from various formats using regex patterns."""
+     try:
+         # Try different patterns to match various date formats
+ 
+         # Pattern 1: Standard RFC format like "Mon, 14 Apr 2025 10:00:00 GMT"
+         pattern1 = r'(?:\w+,\s+)?(\d{1,2}\s+\w{3}\s+\d{4})'
+         match = re.search(pattern1, date_str)
+         if match:
+             date_str = match.group(1)
+             return pd.to_datetime(date_str, format='%d %b %Y')
+ 
+         # Pattern 2: Simple format like "14 Apr 2025"
+         pattern2 = r'(\d{1,2}\s+\w{3}\s+\d{4})'
+         match = re.search(pattern2, date_str)
+         if match:
+             return pd.to_datetime(match.group(1), format='%d %b %Y')
+ 
+         # Pattern 3: ISO format like "2025-04-14"
+         pattern3 = r'(\d{4}-\d{2}-\d{2})'
+         match = re.search(pattern3, date_str)
+         if match:
+             return pd.to_datetime(match.group(1))
+ 
+         # If none of the patterns match, return NaT
+         return pd.NaT
+     except Exception:
+         return pd.NaT
+ 
+ def extract_and_clean_data(df):
+     """Process and clean the feed data."""
+     if df.empty:
+         return df
+ 
+     try:
+         # Apply the custom date extraction function
+         df['date'] = df['Published'].apply(extract_date)
+ 
+         # Drop rows with invalid dates
+         df = df.dropna(subset=['date'])
+ 
+         # Drop the original 'Published' column
+         df = df.drop(columns=['Published'])
+ 
+         # Keep only items from the last 20 days
+         today = datetime.now()
+         cutoff_date = today - timedelta(days=20)
+         df_filtered = df[(df['date'] >= cutoff_date) & (df['date'] <= today)]
+ 
+         # Sort by date in descending order
+         df_filtered = df_filtered.sort_values(by='date', ascending=False)
+ 
+         # Clean HTML and limit description length in one step
+         df_filtered['Description'] = df_filtered['Description'].apply(
+             lambda x: clean_html(x)[:500].replace("\n", "")
+         )
+ 
+         return df_filtered
+ 
+     except Exception as e:
+         print(f"An error occurred while processing the data: {e}")
+         return pd.DataFrame()
+ 
+ def main():
+     """
+     Fetches and processes RSS feed data from a predefined list of sources.
+ 
+     The function defines a dictionary of RSS feed URLs and their corresponding
+     source names. It fetches the RSS feeds using the `fetch_feed` function, then
+     processes and cleans the data using the `extract_and_clean_data` function.
+     The resulting DataFrame, `final_df`, contains cleaned and organized feed data.
+ 
+     Returns:
+         pd.DataFrame: A DataFrame containing cleaned and processed RSS feed data.
+     """
+     links = {
+         "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
+         "https://feeds.feedburner.com/nvidiablog": "NVIDIA Blog",
+         "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
+         "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
+         "https://research.facebook.com/feed/": "META Research",
+         "https://openai.com/news/rss.xml": "OpenAI News",
+         "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
+         "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
+         "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
+         "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_anthropic.xml": "Anthropic News",
+         "https://www.actuia.com/feed/": "ActuIA",
+         "https://news.google.com/rss/search?tbm=nws&q=intelligence+artificielle&oq=intelligence+artificielle&scoring=n&hl=fr&gl=FR&ceid=FR:fr": "Google News - Intelligence Artificielle",
+         "https://www.journaldunet.com/intelligence-artificielle/rss/": "JournalDunet - Intelligence Artificielle",
+         "https://medium.com/feed/tag/AI": "Medium - AI"
+     }
+ 
+     df = fetch_feed(links)
+     final_df = extract_and_clean_data(df)
+ 
+     return final_df
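A minimal usage sketch for the new module, assuming it is run from `src/` so the import resolves; the two feeds below are taken from the `links` dictionary above, and the column names match those built by `fetch_feed`.

```python
# Sketch: fetch two of the feeds defined above and clean the result.
from fetch_data import fetch_feed, extract_and_clean_data

sample_links = {
    "https://openai.com/news/rss.xml": "OpenAI News",
    "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
}

raw_df = fetch_feed(sample_links)          # one row per feed entry: Title, Link, Published, Description, Source
clean_df = extract_and_clean_data(raw_df)  # dates parsed, HTML stripped, limited to the last 20 days

print(clean_df[["date", "Source", "Title"]].head())
```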
src/streamlit_app.py CHANGED
@@ -1,40 +1,140 @@
- import altair as alt
- import numpy as np
- import pandas as pd
  import streamlit as st
+ import pandas as pd
+ from datetime import datetime, timedelta
+ from fetch_data import main, fetch_feed, extract_and_clean_data
 
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
+ # Use Streamlit's built-in caching
+ @st.cache_data(ttl=60)  # Cache for 1 minute
+ def get_data(links):
+     with st.spinner('Fetching latest AI news...'):
+         df = fetch_feed(links)
+         df = extract_and_clean_data(df)
+     return df
+ 
+ def run_dashboard():
+     st.title("AI News Dashboard")
+ 
+     # Default list of RSS feeds
+     default_links = {
+         "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
+         "https://feeds.feedburner.com/nvidiablog": "NVIDIA Blog",
+         "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
+         "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
+         "https://research.facebook.com/feed/": "META Research",
+         "https://openai.com/news/rss.xml": "OpenAI News",
+         "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
+         "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
+         "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
+         "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_anthropic.xml": "Anthropic News",
+         "https://www.actuia.com/feed/": "ActuIA",
+         "https://news.google.com/rss/search?tbm=nws&q=intelligence+artificielle&oq=intelligence+artificielle&scoring=n&hl=fr&gl=FR&ceid=FR:fr": "Google News - Intelligence Artificielle",
+         "https://www.journaldunet.com/intelligence-artificielle/rss/": "JournalDunet - Intelligence Artificielle",
+         "https://medium.com/feed/tag/AI": "Medium - AI"
+     }
+ 
+     # User input for a custom RSS feed
+     st.subheader("Add a custom RSS feed")
+     custom_rss_url = st.text_input("RSS feed URL (e.g. https://www.01net.com/jeux-video/feed/)", key="custom_rss_url")
+     links = default_links.copy()
+ 
+     # Handle the dynamic addition
+     if custom_rss_url:
+         if custom_rss_url not in links:
+             custom_tag = st.text_input("Name/tag for this feed (e.g. ActuAI)", key="custom_tag")
+             if custom_tag:
+                 links[custom_rss_url] = custom_tag
+                 st.success(f"Feed added: {custom_tag}")
+         else:
+             st.info("This feed is already in the list.")
+ 
+     # Add a refresh button
+     if st.button("Refresh Data"):
+         st.cache_data.clear()
+         st.rerun()
+ 
+     # Load data with caching
+     try:
+         df = get_data(links)
+ 
+         # Check if df is empty
+         if df.empty:
+             st.error("No news data available. Please try refreshing later.")
+             return
+ 
+         # Get min and max dates
+         min_date = df['date'].min()
+         max_date = df['date'].max()
+ 
+         # Create layout with columns
+         col1, col2 = st.columns(2)
+ 
+         with col1:
+             selected_dates = st.date_input(
+                 "Choose Date Range",
+                 value=(min_date, max_date),
+                 min_value=min_date,
+                 max_value=max_date
+             )
+ 
+             # Handle single date selection
+             if len(selected_dates) == 1:
+                 start_date = selected_dates[0]
+                 end_date = selected_dates[0]
+             else:
+                 start_date, end_date = selected_dates
+ 
+         with col2:
+             # Get unique sources
+             all_sources = sorted(df['Source'].unique().tolist())
+ 
+             # Add "All" option at the beginning of the list
+             source_options = ["All"] + all_sources
+ 
+             # Use multiselect
+             selected_sources = st.multiselect(
+                 "Choose one or more sources",
+                 options=source_options
+             )
+ 
+         # Show button
+         if st.button("Show News", key="show"):
+             if not selected_sources:
+                 st.error("Please select at least one source to display news.")
+             else:
+                 # Convert dates to datetime
+                 start_date = pd.to_datetime(start_date)
+                 end_date = pd.to_datetime(end_date)
+ 
+                 # Filter by date range
+                 df_filtered = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
+ 
+                 # If "All" is selected, keep every source; otherwise filter by the selection
+                 if "All" not in selected_sources:
+                     df_filtered = df_filtered[df_filtered['Source'].isin(selected_sources)]
+ 
+                 # Display results
+                 if len(df_filtered) > 0:
+                     st.success(f"Found {len(df_filtered)} news items")
+ 
+                     # Show news as cards
+                     for index, row in df_filtered.iterrows():
+                         st.markdown(f"### [{row['Title']}]({row['Link']})")
+                         st.write(f"**Source**: {row['Source']}")
+                         st.write(f"**Description**: {row['Description']}")
+                         st.write(f"**Date**: {row['date'].strftime('%Y-%m-%d')}")
+                         st.markdown("---")  # Add separator between cards
+                 else:
+                     st.warning("No news found with the selected filters. Please adjust your date range or source selection.")
+ 
+     except Exception as e:
+         st.error(f"An error occurred: {str(e)}")
+         st.info("Try refreshing the data using the button above.")
+ 
+ if __name__ == '__main__':
+     run_dashboard()
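The refresh behaviour above hinges on how `st.cache_data(ttl=60)` interacts with the "Refresh Data" button: cached feed data is reused for up to a minute, while the button clears the cache and reruns the script to force a fresh fetch. Below is a stripped-down sketch of that pattern, with a hypothetical `load_data` standing in for the real feed fetch.

```python
import time
import streamlit as st

@st.cache_data(ttl=60)  # results are reused for up to 60 seconds
def load_data():
    # Hypothetical stand-in for fetch_feed + extract_and_clean_data
    return {"fetched_at": time.strftime("%H:%M:%S")}

if st.button("Refresh Data"):
    st.cache_data.clear()  # drop all cached entries...
    st.rerun()             # ...and rerun the script so load_data executes again

st.write(load_data())  # served from cache until the TTL expires or the cache is cleared
```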