File size: 3,374 Bytes
f63fa31
 
 
 
 
f827315
 
 
 
 
f63fa31
715921b
f63fa31
 
715921b
f63fa31
 
715921b
f63fa31
 
715921b
f63fa31
 
 
 
 
715921b
f63fa31
de78f0e
715921b
de78f0e
 
715921b
de78f0e
715921b
8091043
 
8179b58
715921b
 
 
 
 
8091043
 
 
8179b58
8091043
 
715921b
8091043
de78f0e
 
715921b
f63fa31
 
 
715921b
 
 
 
 
 
 
f63fa31
86fe81e
f63fa31
 
de78f0e
715921b
 
 
 
 
 
 
 
 
 
de78f0e
 
715921b
 
f63fa31
715921b
 
f63fa31
715921b
fdfda12
 
715921b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import feedparser
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import logging

# Logging setup.
# NOTE: basicConfig() configures the root logger at INFO level as an
# import-time side effect of this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
# Directory handed to Chroma as its persist_directory (a relative path).
LOCAL_DB_DIR = "chroma_db"
# Feed URLs polled by fetch_rss_feeds(). categorize_feed() assigns a category
# by looking for a site-name substring in the URL, so a feed from a site it
# does not know about is labelled "Uncategorized".
RSS_FEEDS = [
    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
    "https://www.sciencedaily.com/rss/top/science.xml",
    "https://www.wired.com/feed/rss",
    # Add more feeds as needed; starting with reliable ones
]

# Embedding model and vector DB, shared by the functions below.
# NOTE(review): both objects are constructed at import time; presumably this
# loads the sentence-transformers model and opens/creates LOCAL_DB_DIR as soon
# as the module is imported -- confirm before importing from other code.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)

def _first_media_url(entry, field):
    """Return the ``url`` of the first item in ``entry[field]``, or ``None``.

    Tolerates a field that is missing, present but empty, not a list, or whose
    first item is not a mapping -- cases where
    ``entry.get(field, [{}])[0].get("url")`` would raise.
    """
    items = entry.get(field)
    if not isinstance(items, (list, tuple)) or not items:
        return None
    first = items[0]
    return first.get("url") if hasattr(first, "get") else None


def _entry_to_article(entry, category):
    """Convert one parsed feed entry into the article dict used by this module."""
    image = (
        _first_media_url(entry, "media_content")
        or _first_media_url(entry, "media_thumbnail")
        or "svg"  # placeholder value used when the entry carries no image URL
    )
    return {
        "title": entry.get("title", "No Title"),
        "link": entry.get("link", ""),
        "description": entry.get("summary", entry.get("description", "No Description")),
        "published": entry.get("published", "Unknown Date"),
        "category": category,
        "image": image,
    }


def fetch_rss_feeds():
    """Fetch every URL in ``RSS_FEEDS`` and return de-duplicated articles.

    Each article is a dict with the keys ``title``, ``link``, ``description``,
    ``published``, ``category`` and ``image``. Articles are de-duplicated
    across all feeds on the ``"{title}|{link}"`` pair; the first occurrence
    wins.

    Failures are logged and never raised: a feed that cannot be fetched is
    skipped, and an entry that cannot be converted is skipped without
    discarding the remaining entries of its feed.

    Returns:
        list[dict]: the collected articles, in feed order then entry order.
    """
    articles = []
    seen_keys = set()
    for feed_url in RSS_FEEDS:
        try:
            logger.info("Fetching %s", feed_url)
            feed = feedparser.parse(feed_url)
            if feed.bozo:
                logger.warning("Parse error for %s: %s", feed_url, feed.bozo_exception)
                # The bozo flag is also raised for recoverable problems where
                # entries were still parsed; only give up on the feed when
                # nothing usable came back.
                if not feed.entries:
                    continue
            # The category depends only on the feed URL, so compute it once.
            category = categorize_feed(feed_url)
            for entry in feed.entries:
                try:
                    article = _entry_to_article(entry, category)
                except Exception as e:
                    # One malformed entry must not cost the rest of the feed.
                    logger.error("Error processing entry from %s: %s", feed_url, e)
                    continue
                key = f"{article['title']}|{article['link']}"
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                articles.append(article)
        except Exception as e:
            logger.error("Error fetching %s: %s", feed_url, e)
    logger.info("Total articles fetched: %s", len(articles))
    return articles

def categorize_feed(url):
    """Map a feed URL to a category label by case-sensitive substring match.

    Args:
        url: the feed URL string to classify.

    Returns:
        "Science", "Space" or "Tech" for the first matching site marker,
        otherwise "Uncategorized".
    """
    # Checked in order; the first marker found anywhere in the URL wins.
    site_markers = (
        ("sciencedaily", "Science"),
        ("nasa", "Space"),
        ("wired", "Tech"),
    )
    for marker, label in site_markers:
        if marker in url:
            return label
    return "Uncategorized"

def process_and_store_articles(articles):
    """Wrap articles in ``Document`` objects and add them to ``vector_db``.

    The article description becomes the document's ``page_content`` (the text
    that gets embedded); the remaining fields, plus a copy of the description
    under ``original_description``, are stored as metadata.

    Args:
        articles: iterable of dicts with the keys ``title``, ``link``,
            ``description``, ``published``, ``category`` and ``image``
            (the shape returned by ``fetch_rss_feeds``).

    Returns:
        None. Errors are logged, not raised: an article that cannot be
        converted is skipped, and a storage failure is reported once for the
        whole batch.
    """
    # NOTE(review): no ids are passed to add_documents(), so re-running the
    # script on unchanged feeds presumably stores the same articles again --
    # confirm against the vector store's behavior.
    documents = []
    for article in articles:
        try:
            metadata = {
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "image": article["image"],
            }
            documents.append(Document(page_content=article["description"], metadata=metadata))
        except Exception as e:
            # The failure may be a missing "title" key or a non-dict item, so
            # the handler itself must not index into the article.
            title = article.get("title", "<untitled>") if isinstance(article, dict) else "<untitled>"
            logger.error("Error processing article %s: %s", title, e)

    if not documents:
        return

    try:
        vector_db.add_documents(documents)
        logger.info("Stored %s articles in DB", len(documents))
    except Exception as e:
        logger.error("Error storing articles: %s", e)

# Script entry point: fetch all configured feeds once, then convert the
# resulting articles to documents and add them to the vector DB.
if __name__ == "__main__":
    articles = fetch_rss_feeds()
    process_and_store_articles(articles)