"""Handles scraping and preprocessing logic before OpenAI interaction."""

import time
from typing import List, Self

import feedparser
import requests
from bs4 import BeautifulSoup

from src.config.constants import MAX_DEALS_PER_FEED
from src.config.feeds import CATEGORY_FEEDS
from src.utils.logger import console


def extract(html_snippet: str) -> str:
    """Extracts clean text from an HTML snippet.

    Prefers the inner HTML of the div with class "snippet summary" and falls
    back to the full snippet when that element is missing.
    """
    soup = BeautifulSoup(html_snippet, "html.parser")
    snippet = soup.find("div", class_="snippet summary")

    # Extract inner HTML or fallback to full snippet
    raw_html = snippet.decode_contents() if snippet else html_snippet

    # Parse again to clean any nested/malformed HTML
    clean_soup = BeautifulSoup(raw_html, "html.parser")
    text = clean_soup.get_text(" ", strip=True)

    return text.replace("\n", " ")


class ScrapedDeal:
    """Represents a deal from an RSS feed.

    Flow: fetch() → __init__ → _load_content() → use methods.
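
    Example (illustrative; category key assumed present in CATEGORY_FEEDS):
        deals = ScrapedDeal.fetch(["electronics"])
        print(deals[0].describe())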
    """

    category: str  # Deal type
    title: str  # Deal title
    summary: str  # RSS summary
    url: str  # Deal link
    details: str  # Full description
    features: str  # Feature list

    def __init__(self, entry: feedparser.FeedParserDict) -> None:
        """Initialize deal from RSS entry and fetch content."""
        # Basic metadata from RSS
        self.title = entry["title"]
        self.summary = extract(entry["summary"])
        self.url = entry["links"][0]["href"]

        # Initialize placeholders
        self.details = ""
        self.features = ""

        # Fetch and parse full deal content
        self._load_content()

    def _load_content(self) -> None:
        """Fetches and parses the full deal page.

        Raises on failure so the deal can be skipped by fetch().
        """
        try:
            res = requests.get(self.url, timeout=5)
            res.raise_for_status()

            soup = BeautifulSoup(res.content, "html.parser")
            content = soup.find("div", class_="content-section")

            if content:
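                # Strip the "\nmore" link text, collapse newlines to spaces,
                # and split prose details from the feature list on the literal
                # "Features" marker when it is present.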
                text = content.get_text().replace("\nmore", "").replace("\n", " ")
                if "Features" in text:
                    self.details, self.features = text.split("Features", 1)
                else:
                    self.details = text
                    self.features = ""
            else:
                raise ValueError("No content section found.")

        except Exception as e:
            raise RuntimeError(
                f"Failed to load deal content from {self.url}: {e}"
            ) from e

    def __repr__(self) -> str:
        """Quick string representation of the deal."""
        return f"<{self.title}>"

    def describe(self) -> str:
        """Detailed description of the deal."""
        return (
            f"Title: {self.title.strip()}\n"
            f"Details: {self.details.strip()}\n"
            f"Features: {self.features.strip()}\n"
            f"URL: {self.url.strip()}"
        )

    @classmethod
    def fetch(cls, selected_categories: List[str]) -> List[Self]:
        """Parses RSS feeds into ScrapedDeal instances.

        Skips deals that fail to load; raises RuntimeError if every deal fails.
        """
        deals = []
        feed_urls = [
            CATEGORY_FEEDS[cat] for cat in selected_categories if cat in CATEGORY_FEEDS
        ]

        for feed_url in feed_urls:
            feed = cls._parse_feed(feed_url)
            if feed is None:
                continue

            console.print(
                f"[bold blue]DEBUG[/] {len(feed.entries)} entries found in feed: "
                f"{feed_url}"
            )

            for entry in feed.entries[:MAX_DEALS_PER_FEED]:
                cls._process_deal(entry, deals)

            # Throttle requests to avoid hitting servers too fast
            time.sleep(0.5)

        if not deals:
            raise RuntimeError("❌ All deals failed to load. Stopping.")

        return deals

    @staticmethod
    def _parse_feed(feed_url: str) -> feedparser.FeedParserDict | None:
        """Helper method to parse the RSS feed and return the feed data."""
        feed = feedparser.parse(feed_url)
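        # feedparser sets `bozo` when the feed is malformed or could not be
        # parsed; `bozo_exception` carries the underlying error.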
        if feed.bozo:
            console.print(
                f"[bold red]ERROR[/] Failed to parse RSS feed: {feed_url} "
                f"({feed.bozo_exception})"
            )
            return None
        return feed

    @staticmethod
    def _process_deal(
        entry: feedparser.FeedParserDict, deals: List["ScrapedDeal"]
    ) -> None:
        """Helper method to process each RSS entry and add valid deals."""
        try:
            deal = ScrapedDeal(entry)
            deals.append(deal)
        except Exception as e:
            console.print(
                f"[bold yellow]WARN[/] Skipped deal "
                f"'{entry.get('title', 'Unknown')}' due to error: {e}"
            )