tyriaa committed on
Commit
26848d5
·
1 Parent(s): f4e203d

Update the application with the AI news dashboard

Files changed (4)
  1. README.md +17 -4
  2. requirements.txt +7 -3
  3. src/fetch_data.py +162 -0
  4. src/streamlit_app.py +134 -34
README.md CHANGED
@@ -11,9 +11,22 @@ pinned: false
  short_description: 'Regroup multiple RSS flux on AI '
  ---
 
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
+ # AI News Dashboard
+
+ This Streamlit web application aggregates and displays the latest artificial intelligence news from multiple RSS feeds. It lets users filter the news by date and source and add their own custom RSS feeds.
+
+ ## Features
+
+ - Aggregation of RSS feeds from major AI news sources
+ - Filtering by date and source
+ - Support for custom RSS feeds
+ - Data caching for optimal performance
+ - Intuitive user interface
+
+ ## Technologies used
+
+ - Streamlit
+ - Pandas
+ - Feedparser
+ - BeautifulSoup
+ - Concurrent Futures (for parallel processing)
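To make the feature list above concrete, here is a minimal sketch of the aggregation idea, assuming only `feedparser` and `pandas` from the requirements below; the feed URL is one of the sources added in `src/fetch_data.py`, and everything else is illustrative rather than part of the commit.

```python
# Minimal sketch: parse one RSS feed and collect its entries into a DataFrame.
import feedparser
import pandas as pd

feed = feedparser.parse("https://openai.com/news/rss.xml")  # example source from the app's feed list
rows = [
    {
        "Title": entry.get("title", "No Title"),
        "Link": entry.get("link", "No Link"),
        "Published": entry.get("published", "No Date"),
        "Source": "OpenAI News",
    }
    for entry in feed.entries
]
print(pd.DataFrame(rows).head())
```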
requirements.txt CHANGED
@@ -1,3 +1,7 @@
- altair
- pandas
- streamlit
+ streamlit==1.32.0
+ pandas==2.1.0
+ feedparser==6.0.10
+ bs4==0.0.1
+ beautifulsoup4==4.12.2
+ concurrent-log-handler==0.9.24
+ altair
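For reference, `bs4` is a thin wrapper that installs `beautifulsoup4`, and `concurrent.futures` is part of the Python standard library. A quick, hedged way to check that the pinned packages resolve after installation:

```python
# Sanity check: the imports used by src/fetch_data.py and src/streamlit_app.py.
import concurrent.futures  # standard library, no extra package required

import feedparser
import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup

print(feedparser.__version__, pd.__version__, st.__version__)
```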
src/fetch_data.py ADDED
@@ -0,0 +1,162 @@
+ import feedparser
+ import pandas as pd
+ from datetime import datetime, timedelta
+ import ssl
+ from bs4 import BeautifulSoup
+ import concurrent.futures
+ import re
+ 
+ # Disable SSL certificate verification so feeds with misconfigured certificates can still be fetched
+ if hasattr(ssl, '_create_unverified_context'):
+     ssl._create_default_https_context = ssl._create_unverified_context
+ 
+ def fetch_single_feed(link_source_tuple):
+     """Fetch a single RSS feed and return its entries."""
+     link, source = link_source_tuple
+     entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": []}
+ 
+     try:
+         feed = feedparser.parse(link)
+ 
+         for entry in feed.entries:
+             entries["Title"].append(entry.get("title", "No Title"))
+             entries["Link"].append(entry.get("link", "No Link"))
+             entries["Published"].append(entry.get("published", "No Date"))
+             entries["Description"].append(entry.get("description", "No Description"))
+             entries["Source"].append(source)
+ 
+     except Exception as e:
+         print(f"Error fetching {link}: {e}")
+ 
+     return entries
+ 
+ def fetch_feed(links):
+     """Fetch multiple RSS feeds in parallel."""
+     all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": []}
+ 
+     # Use ThreadPoolExecutor to fetch feeds in parallel
+     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+         future_to_link = {executor.submit(fetch_single_feed, (link, source)): (link, source)
+                           for link, source in links.items()}
+ 
+         for future in concurrent.futures.as_completed(future_to_link):
+             link, source = future_to_link[future]
+             try:
+                 result = future.result()
+                 # Merge results into all_entries
+                 for key in all_entries:
+                     all_entries[key].extend(result[key])
+             except Exception as e:
+                 print(f"Exception for {link}: {e}")
+ 
+     # Create a DataFrame from all entries
+     df = pd.DataFrame(all_entries)
+     return df
+ 
+ def clean_html(text):
+     """Clean HTML tags from text."""
+     try:
+         soup = BeautifulSoup(text, "html.parser")
+         return soup.get_text()
+     except Exception as e:
+         print(f"Error cleaning HTML: {e}")
+         return text
+ 
+ def extract_date(date_str):
+     """Extract a date from various formats using regex patterns."""
+     try:
+         # Try different patterns to match various date formats
+ 
+         # Pattern 1: Standard RFC format like "Mon, 14 Apr 2025 10:00:00 GMT"
+         pattern1 = r'(?:\w+,\s+)?(\d{1,2}\s+\w{3}\s+\d{4})'
+         match = re.search(pattern1, date_str)
+         if match:
+             date_str = match.group(1)
+             return pd.to_datetime(date_str, format='%d %b %Y')
+ 
+         # Pattern 2: Simple format like "14 Apr 2025"
+         pattern2 = r'(\d{1,2}\s+\w{3}\s+\d{4})'
+         match = re.search(pattern2, date_str)
+         if match:
+             return pd.to_datetime(match.group(1), format='%d %b %Y')
+ 
+         # Pattern 3: ISO format like "2025-04-14"
+         pattern3 = r'(\d{4}-\d{2}-\d{2})'
+         match = re.search(pattern3, date_str)
+         if match:
+             return pd.to_datetime(match.group(1))
+ 
+         # If none of the patterns match, return NaT
+         return pd.NaT
+     except Exception:
+         return pd.NaT
+ 
+ def extract_and_clean_data(df):
+     """Process and clean the feed data."""
+     if df.empty:
+         return df
+ 
+     try:
+         # Apply the custom date extraction function
+         df['date'] = df['Published'].apply(extract_date)
+ 
+         # Drop rows with invalid dates
+         df = df.dropna(subset=['date'])
+ 
+         # Drop the original 'Published' column
+         df = df.drop(columns=['Published'])
+ 
+         # Keep only items from the last 20 days
+         today = datetime.now()
+         cutoff_date = today - timedelta(days=20)
+         df_filtered = df[(df['date'] >= cutoff_date) & (df['date'] <= today)]
+ 
+         # Sort by date in descending order
+         df_filtered = df_filtered.sort_values(by='date', ascending=False)
+ 
+         # Clean HTML and limit description length in one step
+         df_filtered['Description'] = df_filtered['Description'].apply(
+             lambda x: clean_html(x)[:500].replace("\n", "")
+         )
+ 
+         return df_filtered
+ 
+     except Exception as e:
+         print(f"An error occurred while processing the data: {e}")
+         return pd.DataFrame()
+ 
+ def main():
+     """
+     Fetches and processes RSS feed data from a predefined list of sources.
+ 
+     The function defines a dictionary of RSS feed URLs and their corresponding
+     source names. It fetches the RSS feeds using the `fetch_feed` function, then
+     processes and cleans the data using the `extract_and_clean_data` function.
+     The resulting DataFrame, `final_df`, contains cleaned and organized feed data.
+ 
+     Returns:
+         pd.DataFrame: A DataFrame containing cleaned and processed RSS feed data.
+     """
+     links = {
+         "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
+         "https://feeds.feedburner.com/nvidiablog": "NVIDIA Blog",
+         "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
+         "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
+         "https://research.facebook.com/feed/": "META Research",
+         "https://openai.com/news/rss.xml": "OpenAI News",
+         "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
+         "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
+         "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
+         "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_anthropic.xml": "Anthropic News",
+         "https://www.actuia.com/feed/": "ActuIA",
+         "https://news.google.com/rss/search?tbm=nws&q=intelligence+artificielle&oq=intelligence+artificielle&scoring=n&hl=fr&gl=FR&ceid=FR:fr": "Google News - Intelligence Artificielle",
+         "https://www.journaldunet.com/intelligence-artificielle/rss/": "JournalDunet - Intelligence Artificielle",
+         "https://medium.com/feed/tag/AI": "Medium - AI"
+     }
+ 
+     df = fetch_feed(links)
+     final_df = extract_and_clean_data(df)
+ 
+     return final_df
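A minimal usage sketch for the new module, assuming it is run from `src/` so the import resolves; the two feeds below are taken from the `links` dictionary above, and the column names match those built by `fetch_feed`.

```python
# Sketch: fetch two of the feeds defined above and clean the result.
from fetch_data import fetch_feed, extract_and_clean_data

sample_links = {
    "https://openai.com/news/rss.xml": "OpenAI News",
    "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
}

raw_df = fetch_feed(sample_links)          # one row per feed entry: Title, Link, Published, Description, Source
clean_df = extract_and_clean_data(raw_df)  # dates parsed, HTML stripped, limited to the last 20 days

print(clean_df[["date", "Source", "Title"]].head())
```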
src/streamlit_app.py CHANGED
@@ -1,40 +1,140 @@
- import altair as alt
- import numpy as np
- import pandas as pd
  import streamlit as st
+ import pandas as pd
+ from datetime import datetime, timedelta
+ from fetch_data import main, fetch_feed, extract_and_clean_data
 
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
+ # Use Streamlit's built-in caching
+ @st.cache_data(ttl=60)  # Cache for 1 minute
+ def get_data(links):
+     with st.spinner('Fetching latest AI news...'):
+         df = fetch_feed(links)
+         df = extract_and_clean_data(df)
+     return df
+ 
+ def run_dashboard():
+     st.title("AI News Dashboard")
+ 
+     # Default list of RSS feeds
+     default_links = {
+         "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
+         "https://feeds.feedburner.com/nvidiablog": "NVIDIA Blog",
+         "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
+         "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
+         "https://research.facebook.com/feed/": "META Research",
+         "https://openai.com/news/rss.xml": "OpenAI News",
+         "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
+         "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
+         "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
+         "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
+         "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_anthropic.xml": "Anthropic News",
+         "https://www.actuia.com/feed/": "ActuIA",
+         "https://news.google.com/rss/search?tbm=nws&q=intelligence+artificielle&oq=intelligence+artificielle&scoring=n&hl=fr&gl=FR&ceid=FR:fr": "Google News - Intelligence Artificielle",
+         "https://www.journaldunet.com/intelligence-artificielle/rss/": "JournalDunet - Intelligence Artificielle",
+         "https://medium.com/feed/tag/AI": "Medium - AI"
+     }
+ 
+     # User input for a custom RSS feed
+     st.subheader("Add a custom RSS feed")
+     custom_rss_url = st.text_input("RSS feed URL (e.g. https://www.01net.com/jeux-video/feed/)", key="custom_rss_url")
+     links = default_links.copy()
+ 
+     # Handle the dynamic addition
+     if custom_rss_url:
+         if custom_rss_url not in links:
+             custom_tag = st.text_input("Name/tag for this feed (e.g. ActuAI)", key="custom_tag")
+             if custom_tag:
+                 links[custom_rss_url] = custom_tag
+                 st.success(f"Feed added: {custom_tag}")
+         else:
+             st.info("This feed is already in the list.")
+ 
+     # Add a refresh button
+     if st.button("Refresh Data"):
+         st.cache_data.clear()
+         st.rerun()
+ 
+     # Load data with caching
+     try:
+         df = get_data(links)
+ 
+         # Check if df is empty
+         if df.empty:
+             st.error("No news data available. Please try refreshing later.")
+             return
+ 
+         # Get min and max dates
+         min_date = df['date'].min()
+         max_date = df['date'].max()
+ 
+         # Create layout with columns
+         col1, col2 = st.columns(2)
+ 
+         with col1:
+             selected_dates = st.date_input(
+                 "Choose Date Range",
+                 value=(min_date, max_date),
+                 min_value=min_date,
+                 max_value=max_date
+             )
+ 
+             # Handle single date selection
+             if len(selected_dates) == 1:
+                 start_date = selected_dates[0]
+                 end_date = selected_dates[0]
+             else:
+                 start_date, end_date = selected_dates
+ 
+         with col2:
+             # Get unique sources
+             all_sources = sorted(df['Source'].unique().tolist())
+ 
+             # Add "All" option at the beginning of the list
+             source_options = ["All"] + all_sources
+ 
+             # Use multiselect
+             selected_sources = st.multiselect(
+                 "Choose one or more sources",
+                 options=source_options
+             )
+ 
+         # Show button
+         if st.button("Show News", key="show"):
+             if not selected_sources:
+                 st.error("Please select at least one source to display news.")
+             else:
+                 # Convert dates to datetime
+                 start_date = pd.to_datetime(start_date)
+                 end_date = pd.to_datetime(end_date)
+ 
+                 # Filter by date range
+                 df_filtered = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
+ 
+                 # If "All" is selected, keep every source; otherwise filter by the selection
+                 if "All" not in selected_sources:
+                     df_filtered = df_filtered[df_filtered['Source'].isin(selected_sources)]
+ 
+                 # Display results
+                 if len(df_filtered) > 0:
+                     st.success(f"Found {len(df_filtered)} news items")
+ 
+                     # Show news as cards
+                     for index, row in df_filtered.iterrows():
+                         st.markdown(f"### [{row['Title']}]({row['Link']})")
+                         st.write(f"**Source**: {row['Source']}")
+                         st.write(f"**Description**: {row['Description']}")
+                         st.write(f"**Date**: {row['date'].strftime('%Y-%m-%d')}")
+                         st.markdown("---")  # Add separator between cards
+                 else:
+                     st.warning("No news found with the selected filters. Please adjust your date range or source selection.")
+ 
+     except Exception as e:
+         st.error(f"An error occurred: {str(e)}")
+         st.info("Try refreshing the data using the button above.")
+ 
+ if __name__ == '__main__':
+     run_dashboard()
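The refresh behaviour above hinges on how `st.cache_data(ttl=60)` interacts with the "Refresh Data" button: cached feed data is reused for up to a minute, while the button clears the cache and reruns the script to force a fresh fetch. Below is a stripped-down sketch of that pattern, with a hypothetical `load_data` standing in for the real feed fetch.

```python
import time
import streamlit as st

@st.cache_data(ttl=60)  # results are reused for up to 60 seconds
def load_data():
    # Hypothetical stand-in for fetch_feed + extract_and_clean_data
    return {"fetched_at": time.strftime("%H:%M:%S")}

if st.button("Refresh Data"):
    st.cache_data.clear()  # drop all cached entries...
    st.rerun()             # ...and rerun the script so load_data executes again

st.write(load_data())  # served from cache until the TTL expires or the cache is cleared
```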