Spaces:

tamirgz
/

Phidata

Running

File size: 49,718 Bytes

1be3350

import json
import re
import time
import os
import concurrent.futures
from typing import Optional, Iterator, List, Set, Dict, Any
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from datetime import datetime

# Phi imports
from phi.workflow import Workflow, RunResponse, RunEvent
from phi.storage.workflow.sqlite import SqlWorkflowStorage
from phi.agent import Agent
from phi.model.groq import Groq  
from phi.tools.duckduckgo import DuckDuckGo
from phi.tools.googlesearch import GoogleSearch
from phi.utils.pprint import pprint_run_response
from phi.utils.log import logger

# Error handling imports
from duckduckgo_search.exceptions import RatelimitException
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
from requests.exceptions import HTTPError

from config import GROQ_API_KEY, NVIDIA_API_KEY, SEARCHER_MODEL_CONFIG, WRITER_MODEL_CONFIG, get_hf_model
import configparser

# Number of results requested from the DuckDuckGo tool on every call; it is
# passed as ``fixed_max_results`` when the primary searcher agent is built.
DUCK_DUCK_GO_FIXED_MAX_RESULTS = 10

# Runtime settings are read from ``config.ini`` at import time. The path is
# relative, so it is looked up from the process's current working directory.
# NOTE(review): ``ConfigParser.read`` silently ignores a missing file, in which
# case the ``config.get`` calls below raise at import time.
config = configparser.ConfigParser()
config.read('config.ini')
DEFAULT_TOPIC = config.get('DEFAULT', 'default_topic')
# NOTE(review): ``config.get`` returns a single string, yet this value is used
# as the default for a ``List[str]`` field on BlogPostGenerator. The expected
# format (comma- or newline-separated URLs?) is not visible here; confirm
# how the crawler consumes it.
INITIAL_WEBSITES = config.get('DEFAULT', 'initial_websites')

# The topic to generate a blog post on
topic = DEFAULT_TOPIC

class NewsArticle(BaseModel):
    """Article data model containing title, URL and description.

    ``title`` and ``url`` are required; ``description`` is optional and
    defaults to ``None``. Instances are built both from parsed search
    responses and from crawler result dicts.
    """
    # Required: human-readable headline of the article.
    title: str = Field(..., description="Title of the article.")
    # Required: link to the article; the workflow de-duplicates sources by this value.
    url: str = Field(..., description="Link to the article.")
    # Optional free-text summary; None when the source provided none.
    description: Optional[str] = Field(None, description="Summary of the article if available.")


class SearchResults(BaseModel):
    """Container for search results containing a list of articles.

    Used as the ``response_model`` of both search agents and as the return
    type of the response parser.
    """
    # Articles in the order they were produced by the search or the parser.
    articles: List[NewsArticle]


class BlogPostGenerator(Workflow):
    """Workflow for generating blog posts based on web research.

    The workflow searches the web, crawls a configured list of websites,
    asks a writer agent for a markdown report and saves it as an HTML file.
    """
    # Primary search agent (replaced by a DuckDuckGo-backed agent in __init__).
    searcher: Agent = Field(...)
    # Fallback search agent (replaced by a Google-Search-backed agent in __init__).
    backup_searcher: Agent = Field(...)
    # Agent that turns the gathered sources into the report text.
    writer: Agent = Field(...)
    # Websites handed to the crawler in run().
    # NOTE(review): the default is the raw string read from config.ini, not a
    # list, despite the List[str] annotation; confirm the expected format.
    initial_websites: List[str] = Field(default_factory=lambda: INITIAL_WEBSITES)
    # Optional helper exposing ``report_dir``; used when saving the HTML report.
    file_handler: Optional[Any] = Field(None)

    def __init__(
        self,
        session_id: str,
        searcher: Agent,
        backup_searcher: Agent,
        writer: Agent,
        file_handler: Optional[Any] = None,
        storage: Optional[SqlWorkflowStorage] = None,
    ):
        """Create the workflow and build its search and writer agents.

        Args:
            session_id: Identifier forwarded to the base ``Workflow``.
            searcher: Forwarded to the base class, then replaced below by a
                freshly built DuckDuckGo agent.
            backup_searcher: Forwarded to the base class, then replaced below
                by a freshly built Google Search agent.
            writer: Forwarded to the base class, then replaced below by a
                freshly built writer agent.
            file_handler: Optional object exposing ``report_dir``; stored on
                the instance after base-class initialisation.
            storage: Optional workflow storage forwarded to the base class.

        NOTE(review): because all three agent attributes are overwritten
        unconditionally, the agents passed by the caller only serve to satisfy
        the required fields during base-class initialisation.
        """
        super().__init__(
            session_id=session_id,
            searcher=searcher,
            backup_searcher=backup_searcher,
            writer=writer,
            storage=storage,
        )
        # Set after super().__init__ because it is not forwarded to the base class.
        self.file_handler = file_handler
        
        # Configure search instructions
        # Shared by the primary and backup searchers so both return the same schema.
        search_instructions = [
            "Given a topic, search for 20 articles and return the 15 most relevant articles.",
            "For each article, provide:",
            "- title: The article title",
            "- url: The article URL",
            "- description: A brief description or summary of the article",
            "Return the results in a structured format with these exact field names."
        ]
        
        # Primary searcher using DuckDuckGo
        self.searcher = Agent(
            model=get_hf_model('searcher'),
            tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)],
            instructions=search_instructions,
            response_model=SearchResults
        )

        
        # Backup searcher using Google Search
        self.backup_searcher = Agent(
            model=get_hf_model('searcher'),
            tools=[GoogleSearch()],
            instructions=search_instructions,
            response_model=SearchResults
        )


        # Writer agent configuration
        # The list below is prompt text sent to the model; edit with care.
        writer_instructions = [
            "You are a professional research analyst tasked with creating a comprehensive report on the given topic.",
            "The sources provided include both general web search results and specialized intelligence/security websites.",
            "Carefully analyze and cross-reference information from all sources to create a detailed report.",
            "",
            "Report Structure:",
            "1. Executive Summary (2-3 paragraphs)",
            "   - Provide a clear, concise overview of the main findings",
            "   - Address the research question directly",
            "   - Highlight key discoveries and implications",
            "",
            "2. Detailed Analysis (Multiple sections)",
            "   - Break down the topic into relevant themes or aspects",
            "   - For each theme:",
            "     * Present detailed findings from multiple sources",
            "     * Cross-reference information between general and specialized sources",
            "     * Analyze trends, patterns, and developments",
            "     * Discuss implications and potential impacts",
            "",
            "3. Source Analysis and Credibility",
            "   For each major source:",
            "   - Evaluate source credibility and expertise",
            "   - Note if from specialized intelligence/security website",
            "   - Assess potential biases or limitations",
            "   - Key findings and unique contributions",
            "",
            "4. Key Takeaways and Strategic Implications",
            "   - Synthesize findings from all sources",
            "   - Compare/contrast general media vs specialized analysis",
            "   - Discuss broader geopolitical implications",
            "   - Address potential future developments",
            "",
            "5. References",
            "   - Group sources by type (specialized websites vs general media)",
            "   - List all sources with full citations",
            "   - Include URLs as clickable markdown links [Title](URL)",
            "   - Ensure every major claim has at least one linked source",
            "",
            "Important Guidelines:",
            "- Prioritize information from specialized intelligence/security sources",
            "- Cross-validate claims between multiple sources when possible",
            "- Maintain a professional, analytical tone",
            "- Support all claims with evidence",
            "- Include specific examples and data points",
            "- Use direct quotes for significant statements",
            "- Address potential biases in reporting",
            "- Ensure the report directly answers the research question",
            "",
            "Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.",
            "Each major section should contain multiple paragraphs with detailed analysis."
        ]
        
        self.writer = Agent(
            model=get_hf_model('writer'),
            instructions=writer_instructions,
            structured_outputs=True
        )


    def _parse_search_response(self, response) -> Optional[SearchResults]:
        """Parse and validate a search response into a ``SearchResults`` model.

        Supported inputs:
            * ``str`` - JSON text (optionally inside a markdown code fence);
              when it cannot be decoded, URLs/titles/descriptions are
              extracted with regular expressions as a best effort.
            * ``dict`` - must contain an ``'articles'`` key whose entries are
              dicts or ``NewsArticle`` instances.
            * ``SearchResults`` - returned unchanged.
            * ``RunResponse`` - its ``content`` is parsed recursively.

        Returns:
            A ``SearchResults`` with at least one article, or ``None`` when
            nothing usable was found, the type is unsupported, or any error
            occurred (errors are logged, never raised).
        """
        try:
            if isinstance(response, str):
                return self._parse_search_text(response)

            if isinstance(response, dict):
                if 'articles' not in response:
                    return None
                articles = self._build_articles(response['articles'])
                if articles:
                    logger.info(f"Successfully processed {len(articles)} articles from dict")
                    return SearchResults(articles=articles)
                return None

            if isinstance(response, SearchResults):
                # Already in correct format
                return response

            if isinstance(response, RunResponse):
                if response.content:
                    return self._parse_search_response(response.content)
                return None

            logger.error(f"Unsupported response type: {type(response)}")
            return None

        except Exception as e:
            # Parsing is best-effort: callers treat None as "no results".
            logger.error(f"Error parsing search response: {str(e)}")
            return None

    def _build_articles(self, items) -> List[NewsArticle]:
        """Convert raw entries (dicts or ``NewsArticle``) into ``NewsArticle`` objects."""

        def clean(value) -> str:
            # A JSON null must become "", not the literal string "None".
            return "" if value is None else str(value).strip()

        articles = []
        for item in items:
            if isinstance(item, NewsArticle):
                articles.append(item)
            elif isinstance(item, dict):
                title = clean(item.get('title'))
                url = clean(item.get('url'))
                if title and url:  # Only keep entries that have both required fields
                    articles.append(NewsArticle(
                        title=title,
                        url=url,
                        description=clean(item.get('description')),
                    ))
        return articles

    def _trim_url(self, url: str) -> str:
        """Strip trailing punctuation and unbalanced closing brackets from *url*."""
        closers = {')': '(', ']': '[', '}': '{'}
        url = url.strip()
        while url:
            last = url[-1]
            if last in '.,;:!?\'"\\':
                url = url[:-1]
            elif last in closers and url.count(last) > url.count(closers[last]):
                # e.g. the ")" that closes a markdown link "[title](url)"
                url = url[:-1]
            else:
                break
        return url

    def _parse_search_text(self, text: str) -> Optional[SearchResults]:
        """Parse a textual search response: JSON first, regex extraction as fallback."""
        content = text.strip()
        if '```' in content:
            # Extract content between code block markers
            fenced = re.search(r'```(?:json)?\n(.*?)\n```', content, re.DOTALL)
            if fenced:
                content = fenced.group(1).strip()
            else:
                # If no proper code block found, remove all ``` markers
                content = re.sub(r'```(?:json)?\n?', '', content).strip()

        # Clean up any trailing commas before closing brackets/braces
        content = re.sub(r',(\s*[}\]])', r'\1', content)
        # Drop the backslash of invalid escapes. Escapes are consumed pairwise,
        # left to right, so an escaped backslash ("\\\\x") is left intact.
        content = re.sub(
            r'\\(.)',
            lambda m: m.group(0) if m.group(1) in '"\\/bfnrtu' else m.group(1),
            content,
            flags=re.DOTALL,
        )
        content = content.replace('\t', ' ')  # Replace tabs with spaces

        try:
            # \uXXXX escapes are left for the JSON decoder: it joins surrogate
            # pairs correctly and cannot be broken by an escaped quote.
            # strict=False tolerates raw control characters inside strings.
            data = json.loads(content, strict=False)
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON response: {str(e)}, attempting to extract data manually")
        else:
            if isinstance(data, dict):
                items = data.get('articles')
            else:
                items = data  # a bare top-level array of articles
            if isinstance(items, list):
                articles = self._build_articles(items)
                if articles:
                    logger.info(f"Successfully parsed {len(articles)} articles from JSON")
                    return SearchResults(articles=articles)

        # Fallback to regex extraction if JSON parsing fails or yields nothing.
        # Unicode escapes are decoded here (runs at a time, so surrogate pairs
        # are combined) because the text is not going through the JSON decoder.
        fallback = re.sub(
            r'(?:\\u[0-9a-fA-F]{4})+',
            lambda m: json.loads(f'"{m.group(0)}"'),
            content,
        )
        found = (self._trim_url(u) for u in re.findall(r'https?://[^\s<>"]+|www\.[^\s<>"]+', fallback))
        urls = list(dict.fromkeys(u for u in found if u))  # de-duplicate, keep order
        titles = re.findall(r'"title":\s*"((?:[^"\\]|\\.)+)"', fallback)
        descriptions = re.findall(r'"description":\s*"((?:[^"\\]|\\.)+)"', fallback)

        if urls:
            articles = []
            for i, url in enumerate(urls):
                # Titles/descriptions are paired with URLs by position only.
                title = titles[i] if i < len(titles) else f"Article {i+1}"
                description = descriptions[i] if i < len(descriptions) else ""
                articles.append(NewsArticle(
                    title=title.strip().replace('\\"', '"'),
                    url=url,
                    description=description.strip().replace('\\"', '"'),
                ))

            logger.info(f"Successfully extracted {len(articles)} articles using regex")
            return SearchResults(articles=articles)

        logger.warning("No valid articles found in response")
        return None

    def _search_with_retry(self, topic: str, use_backup: bool = False, max_retries: int = 3) -> Optional[SearchResults]:
        """Execute search with retries and rate limit handling.

        Args:
            topic: Subject to search for; interpolated into each search prompt.
            use_backup: When True, use ``self.backup_searcher`` instead of
                ``self.searcher``.
            max_retries: Maximum number of attempts for this source.

        Returns:
            The first parsed ``SearchResults`` containing articles, or ``None``
            when every attempt fails or yields nothing.

        NOTE(review): this call can block for a long time. Rate-limit back-off
        sleeps ``min(3600, 60 * 2**attempt)`` seconds, and when the primary
        source is rate limited the backup search (with its own retries) is
        invoked both in the ``except`` handler and again at the top of the
        next attempt.
        """
        searcher = self.backup_searcher if use_backup else self.searcher
        source = "backup" if use_backup else "primary"
        
        # Initialize rate limit tracking
        # Local to this call: only ever holds ``source`` once a rate limit was seen.
        rate_limited_sources = set()
        
        for attempt in range(max_retries):
            try:
                if source in rate_limited_sources:
                    logger.warning(f"{source} search is rate limited, switching to alternative method")
                    if not use_backup:
                        # Try backup search if primary is rate limited
                        backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries)
                        if backup_results:
                            return backup_results
                    # If both sources are rate limited, use longer backoff
                    backoff_time = min(3600, 60 * (2 ** attempt))  # Max 1 hour backoff
                    logger.info(f"All search methods rate limited. Waiting {backoff_time} seconds before retry...")
                    time.sleep(backoff_time)
                
                logger.info(f"\nAttempting {source} search (attempt {attempt + 1}/{max_retries})...")
                
                # Try different search prompts to improve results
                # The prompts do not depend on ``attempt``; they are rebuilt each pass.
                search_prompts = [
                    f"""Search for detailed articles about: {topic}
                    Return only high-quality, relevant sources.
                    Format the results as a JSON object with an 'articles' array containing:
                    - title: The article title
                    - url: The article URL
                    - description: A brief description or summary
                    """,
                    f"""Find comprehensive articles and research papers about: {topic}
                    Focus on authoritative sources and recent publications.
                    Return results in JSON format with 'articles' array.
                    """,
                    f"""Locate detailed analysis and reports discussing: {topic}
                    Prioritize academic, industry, and news sources.
                    Return structured JSON with article details.
                    """
                ]
                
                # Try each prompt until we get results
                for prompt in search_prompts:
                    try:
                        response = searcher.run(prompt, stream=False)
                        results = self._parse_search_response(response)
                        if results and results.articles:
                            logger.info(f"Found {len(results.articles)} articles from {source} search")
                            return results
                    except Exception as e:
                        # Rate limits are detected by substring match on the message.
                        # NOTE(review): "rate"/"limit" also match unrelated text
                        # (e.g. "generate"), so other errors may be misclassified.
                        if any(err in str(e).lower() for err in ["rate", "limit", "quota", "exhausted"]):
                            rate_limited_sources.add(source)
                            # Re-raise so the outer handler applies the rate-limit path.
                            raise
                        logger.warning(f"Search prompt failed: {str(e)}")
                        continue
                
                # Reached only when no prompt produced articles and none raised a
                # rate-limit error; the next attempt starts without any back-off.
                logger.warning(f"{source.title()} search returned no valid results")
                
            except Exception as e:
                error_msg = str(e).lower()
                if any(err in error_msg for err in ["rate", "limit", "quota", "exhausted"]):
                    rate_limited_sources.add(source)
                    logger.error(f"{source} search rate limited: {str(e)}")
                    # Try alternative source immediately
                    if not use_backup:
                        backup_results = self._search_with_retry(topic, use_backup=True, max_retries=max_retries)
                        if backup_results:
                            return backup_results
                else:
                    logger.error(f"Error during {source} search (attempt {attempt + 1}): {str(e)}")
                
                # No sleep after the final attempt: there is nothing left to retry.
                if attempt < max_retries - 1:
                    backoff_time = 2 ** attempt
                    if source in rate_limited_sources:
                        backoff_time = min(3600, 60 * (2 ** attempt))  # Longer backoff for rate limits
                    logger.info(f"Waiting {backoff_time} seconds before retry...")
                    time.sleep(backoff_time)
        
        return None

    def _validate_content(self, content: str) -> bool:
        """Validate that the generated content is readable and properly formatted."""
        if not content or len(content.strip()) < 100:
            logger.warning("Content too short or empty")
            return False
            
        # Check for basic structure
        if not any(marker in content for marker in ['#', '\n\n']):
            logger.warning("Content lacks proper structure (headers or paragraphs)")
            return False
            
        # Check for reasonable paragraph lengths
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
        if not paragraphs:
            logger.warning("No valid paragraphs found")
            return False
            
        # Common words that are allowed to repeat frequently
        common_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
            'this', 'that', 'these', 'those', 'it', 'its', 'is', 'are', 'was', 'were', 'be', 'been',
            'has', 'have', 'had', 'would', 'could', 'should', 'will', 'can'
        }
        
        # Track word frequencies across paragraphs
        word_frequencies = {}
        total_words = 0
        
        # Validate each paragraph
        for para in paragraphs:
            # Skip headers and references
            if para.startswith('#') or para.startswith('http'):
                continue
                
            # Calculate word statistics
            words = para.split()
            if len(words) < 3:
                continue  # Skip very short paragraphs
                
            # Calculate word statistics
            word_lengths = [len(word) for word in words]
            avg_word_length = sum(word_lengths) / len(word_lengths)
            
            # More nuanced word length validation
            long_words = [w for w in words if len(w) > 15]
            long_word_ratio = len(long_words) / len(words) if words else 0
            
            # Allow higher average length if the text contains URLs or technical terms
            contains_url = any(word.startswith(('http', 'www')) for word in words)
            contains_technical = any(word.lower().endswith(('tion', 'ment', 'ology', 'ware', 'tech')) for word in words)
            
            # Adjust thresholds based on content type
            max_avg_length = 12  # Base maximum average word length
            if contains_url:
                max_avg_length = 20  # Allow longer average for content with URLs
            elif contains_technical:
                max_avg_length = 15  # Allow longer average for technical content
            
            # Fail only if multiple indicators of problematic text
            if (avg_word_length > max_avg_length and long_word_ratio > 0.3) or avg_word_length > 25:
                logger.warning(f"Suspicious word lengths: avg={avg_word_length:.1f}, long_ratio={long_word_ratio:.1%}")
                return False
            
            # Check for excessive punctuation or special characters
            special_char_ratio = len(re.findall(r'[^a-zA-Z0-9\s.,!?()"-]', para)) / len(para)
            if special_char_ratio > 0.15:  # Increased threshold slightly
                logger.warning(f"Too many special characters: {special_char_ratio}")
                return False
                
            # Check for coherent sentence structure
            sentences = [s.strip() for s in re.split(r'[.!?]+', para) if s.strip()]
            weak_sentences = 0
            for sentence in sentences:
                words = sentence.split()
                if len(words) < 3:  # Skip very short sentences
                    continue
                    
                # More lenient grammar check
                structure_indicators = [
                    any(word[0].isupper() for word in words),  # Has some capitalization
                    any(word.lower() in common_words for word in words),  # Has common words
                    len(words) >= 3,  # Reasonable length
                    any(len(word) > 3 for word in words),  # Has some non-trivial words
                ]
                
                # Only fail if less than 2 indicators are present
                if sum(structure_indicators) < 2:
                    logger.warning(f"Weak sentence structure: {sentence}")
                    weak_sentences += 1
                    if weak_sentences > len(sentences) / 2:  # Fail if more than half are weak
                        logger.warning("Too many poorly structured sentences")
                        return False
                
                # Update word frequencies
                for word in words:
                    word = word.lower()
                    if word not in common_words and len(word) > 2:  # Only track non-common words
                        word_frequencies[word] = word_frequencies.get(word, 0) + 1
                        total_words += 1
        
        # Check for excessive repetition
        if total_words > 0:
            for word, count in word_frequencies.items():
                # Calculate the frequency as a percentage
                frequency = count / total_words
                
                # Allow up to 10% frequency for any word
                if frequency > 0.1 and count > 3:
                    logger.warning(f"Word '{word}' appears too frequently ({count} times, {frequency:.1%})")
                    return False
        
        # Content seems valid
        return True

    def _save_markdown(self, topic: str, content: str) -> Optional[str]:
        """Render markdown *content* into a styled HTML report and write it to disk.

        The report directory comes from ``self.file_handler.report_dir`` when a
        file handler is set; otherwise a dated ``report_YYYY-MM-DD`` directory
        next to this module is used. The directory is created if missing.

        Args:
            topic: Report topic; used (HTML-escaped) as the page title and,
                slugified, as the file name.
            content: Markdown text of the report.

        Returns:
            The path of the written ``.html`` file, or ``None`` if anything
            failed (the error is logged, not raised).
        """
        import html as html_lib  # local import: ``html`` is not imported at file level

        try:
            # Get or create report directory
            handler = getattr(self, 'file_handler', None)
            if handler:
                report_dir = handler.report_dir
                # Do not assume the handler already created its directory.
                os.makedirs(report_dir, exist_ok=True)
            else:
                # Create a default report directory if no file handler
                report_dir = os.path.join(os.path.dirname(__file__), f"report_{datetime.now().strftime('%Y-%m-%d')}")
                os.makedirs(report_dir, exist_ok=True)
                logger.info(f"Created report directory: {report_dir}")

            # Create filename from topic
            slug = re.sub(r'[^\w\s-]', '', topic.lower())  # Remove special chars
            slug = re.sub(r'[-\s]+', '-', slug)            # Replace spaces with hyphens
            if not slug.strip('-'):
                # Topic had no usable characters; avoid a bare ".html" file name.
                slug = "report"
            # Stay under common 255-byte file name limits without splitting a
            # multi-byte character.
            slug = slug.encode('utf-8')[:200].decode('utf-8', errors='ignore')
            file_path = os.path.join(report_dir, f"{slug}.html")

            # The topic is user-supplied text: escape it before embedding in HTML.
            safe_title = html_lib.escape(topic)

            # Convert markdown to HTML with styling
            html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>{safe_title}</title>
                <style>
                    body {{
                        font-family: Arial, sans-serif;
                        line-height: 1.6;
                        color: #333;
                        max-width: 1200px;
                        margin: 0 auto;
                        padding: 20px;
                    }}
                    h1 {{
                        color: #2c3e50;
                        border-bottom: 2px solid #3498db;
                        padding-bottom: 10px;
                    }}
                    h2 {{
                        color: #34495e;
                        margin-top: 30px;
                    }}
                    h3 {{
                        color: #455a64;
                    }}
                    a {{
                        color: #3498db;
                        text-decoration: none;
                    }}
                    a:hover {{
                        text-decoration: underline;
                    }}
                    .executive-summary {{
                        background-color: #f8f9fa;
                        border-left: 4px solid #3498db;
                        padding: 20px;
                        margin: 20px 0;
                    }}
                    .analysis-section {{
                        margin: 30px 0;
                    }}
                    .source-section {{
                        background-color: #f8f9fa;
                        padding: 15px;
                        margin: 10px 0;
                        border-radius: 5px;
                    }}
                    .references {{
                        margin-top: 40px;
                        border-top: 2px solid #ecf0f1;
                        padding-top: 20px;
                    }}
                    .timestamp {{
                        color: #7f8c8d;
                        font-size: 0.9em;
                        margin-top: 40px;
                        text-align: right;
                    }}
                    blockquote {{
                        border-left: 3px solid #3498db;
                        margin: 20px 0;
                        padding-left: 20px;
                        color: #555;
                    }}
                    code {{
                        background-color: #f7f9fa;
                        padding: 2px 5px;
                        border-radius: 3px;
                        font-family: monospace;
                    }}
                </style>
            </head>
            <body>
                <div class="content">
                    {self._markdown_to_html(content)}
                </div>
                <div class="timestamp">
                    Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
                </div>
            </body>
            </html>
            """

            # Write the HTML file
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(html_content)

            logger.info(f"Successfully saved HTML report: {file_path}")
            return file_path

        except Exception as e:
            # Saving is best-effort; the caller checks for a None path.
            logger.error(f"Failed to save HTML file: {str(e)}")
            return None
    
    def _markdown_to_html(self, markdown_content: str) -> str:
        """Convert markdown content to HTML with basic formatting."""
        # Headers
        html = markdown_content
        html = re.sub(r'^# (.*?)$', r'<h1>\1</h1>', html, flags=re.MULTILINE)
        html = re.sub(r'^## (.*?)$', r'<h2>\1</h2>', html, flags=re.MULTILINE)
        html = re.sub(r'^### (.*?)$', r'<h3>\1</h3>', html, flags=re.MULTILINE)
        
        # Lists
        html = re.sub(r'^\* (.*?)$', r'<li>\1</li>', html, flags=re.MULTILINE)
        html = re.sub(r'(<li>.*?</li>\n)+', r'<ul>\n\g<0></ul>', html, flags=re.DOTALL)
        
        # Links
        html = re.sub(r'\[(.*?)\]\((.*?)\)', r'<a href="\2">\1</a>', html)
        
        # Emphasis
        html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', html)
        html = re.sub(r'\*(.*?)\*', r'<em>\1</em>', html)
        
        # Paragraphs
        html = re.sub(r'\n\n(.*?)\n\n', r'\n<p>\1</p>\n', html, flags=re.DOTALL)
        
        # Blockquotes
        html = re.sub(r'^\> (.*?)$', r'<blockquote>\1</blockquote>', html, flags=re.MULTILINE)
        
        # Code blocks
        html = re.sub(r'```(.*?)```', r'<pre><code>\1</code></pre>', html, flags=re.DOTALL)
        html = re.sub(r'`(.*?)`', r'<code>\1</code>', html)
        
        return html

    def run(self, topic: str, use_cache: bool = True) -> Iterator[RunResponse]:
        """Research *topic*, write a report and yield one final ``RunResponse``.

        Steps: primary web search, crawl of ``self.initial_websites``, backup
        search when fewer than 10 unique sources were found, report writing
        (retried once if validation fails) and saving as HTML.

        Args:
            topic: Research question or subject of the report.
            use_cache: Accepted but not read anywhere in this method.

        Yields:
            Exactly one ``RunResponse`` with event ``run_completed``: either a
            failure message or, on success, the report text as ``content``.
        """
        logger.info(f"Starting blog post generation for topic: {topic}")

        # Keywords for the crawler: topic words longer than 3 characters that
        # are not question/stop words.
        stop_words = {'what', 'where', 'when', 'how', 'why', 'is', 'are', 'was', 'were', 'will', 'would', 'could', 'should', 'the', 'and', 'but', 'or', 'for', 'with'}
        keywords = [token for token in topic.lower().split() if len(token) > 3 and token not in stop_words]

        all_articles = []
        existing_urls = set()

        def add_unique(found) -> None:
            # Append articles whose URL has not been collected yet.
            for candidate in found:
                if candidate.url not in existing_urls:
                    all_articles.append(candidate)
                    existing_urls.add(candidate.url)

        def text_of(response):
            # The writer may return a RunResponse or the content itself.
            if isinstance(response, RunResponse):
                return response.content
            return response

        # Step 1: primary web search
        logger.info("Starting web search...")
        search_results = self._search_with_retry(topic)
        if search_results and search_results.articles:
            add_unique(search_results.articles)
            logger.info(f"Found {len(search_results.articles)} articles from web search")

        # Step 2: crawl the configured websites
        logger.info("Starting website crawl...")
        from file_handler import FileHandler
        crawler = WebsiteCrawler(max_pages_per_site=10)
        crawler.file_handler = FileHandler()  # the crawler gets its own handler

        # The relevance log is written into the handler's report directory.
        report_dir = crawler.file_handler.report_dir

        crawled_results = crawler.crawl_all_websites(self.initial_websites, keywords)
        crawler.save_relevance_log(report_dir)

        if crawled_results:
            for result in crawled_results:
                if result['url'] in existing_urls:
                    continue
                all_articles.append(NewsArticle(**result))
                existing_urls.add(result['url'])
            logger.info(f"Found {len(crawled_results)} articles from website crawl")

        # Step 3: top up from the backup searcher when sources are scarce
        if len(all_articles) < 10:
            logger.info("Supplementing with backup search...")
            backup_results = self._search_with_retry(topic, use_backup=True)
            if backup_results and backup_results.articles:
                add_unique(backup_results.articles)
                logger.info(f"Found {len(backup_results.articles)} articles from backup search")

        search_results = SearchResults(articles=all_articles)
        source_count = len(search_results.articles)

        if source_count < 5:  # minimum number of sources needed for a report
            error_msg = f"Failed to gather sufficient sources. Only found {source_count} valid sources."
            logger.error(error_msg)
            yield RunResponse(
                event=RunEvent.run_completed,
                message=error_msg
            )
            return

        logger.info(f"Successfully gathered {source_count} unique sources for analysis")

        # Step 4: write the report
        print("\nGenerating report from search results...")
        detailed_sources = json.dumps([{'title': a.title, 'url': a.url, 'description': a.description} for a in search_results.articles], indent=2)
        writer_response = self.writer.run(
            f"""Generate a comprehensive research report on: {topic}
            Use the following articles as sources:
            {detailed_sources}
            
            Format the output in markdown with:
            1. Clear section headers using #, ##, ###
            2. Proper paragraph spacing
            3. Bullet points where appropriate
            4. Links to sources
            5. A references section at the end
            
            Focus on readability and proper markdown formatting.""",
            stream=False
        )
        content = text_of(writer_response)

        if not self._validate_content(content):
            print("\nFirst attempt produced invalid content, trying again...")
            # Second and last attempt, with a more explicit structure and
            # sources reduced to title and URL.
            brief_sources = json.dumps([{'title': a.title, 'url': a.url} for a in search_results.articles], indent=2)
            writer_response = self.writer.run(
                f"""Generate a clear, well-structured research report on: {topic}
                Format the output in proper markdown with:
                1. A main title using # 
                2. Section headers using ##
                3. Subsection headers using ###
                4. Well-formatted paragraphs
                5. Bullet points for lists
                6. A references section at the end
                
                Source articles:
                {brief_sources}""",
                stream=False
            )
            content = text_of(writer_response)

            if not self._validate_content(content):
                yield RunResponse(
                    event=RunEvent.run_completed,
                    message="Failed to generate readable content. Please try again."
                )
                return

        # Step 5: persist as HTML
        html_file = self._save_markdown(topic, content)
        if not html_file:
            yield RunResponse(
                event=RunEvent.run_completed,
                message="Failed to save HTML file. Please try again."
            )
            return

        # Echo the report to the console before reporting success.
        print("\n=== Generated Report ===\n")
        print(content)
        print("\n=====================\n")

        yield RunResponse(
            event=RunEvent.run_completed,
            message=f"Report generated successfully. HTML saved as: {html_file}",
            content=content
        )

class WebsiteCrawler:
    """Keyword-driven crawler that collects relevant pages from websites.

    A page is considered relevant when at least one keyword occurs in the text
    of its paragraphs and headings. Every page that is fetched is recorded in
    ``relevance_log`` so the decisions can be written out afterwards with
    ``save_relevance_log``.

    Instance state:
        max_pages_per_site: Upper bound on page attempts per ``crawl_website`` call.
        request_timeout: Timeout in seconds passed to every HTTP request.
        visited_urls: URLs already attempted; shared by all crawls of this instance.
        results: Reserved mapping; not populated by the methods of this class.
        file_handler: Optional object exposing ``is_supported_file(url)`` and
            ``download_file(url, source_page=...)``; when set, supported files
            linked from relevant pages are downloaded.
        relevance_log: One dict per fetched page (url, title, timestamp,
            relevance_info).
    """

    def __init__(self, max_pages_per_site: int = 10, request_timeout: float = 10):
        self.max_pages_per_site = max_pages_per_site
        self.request_timeout = request_timeout
        self.visited_urls: Set[str] = set()
        self.results: Dict[str, List[dict]] = {}
        self.file_handler = None

        # Relevance decisions, in the order pages were fetched
        self.relevance_log = []

    def _check_relevance(self, text: str, keywords: List[str]) -> tuple[bool, dict]:
        """
        Check if the page content is relevant based on keywords.

        Matching is a case-insensitive substring count. Blank keywords are
        reported with a count of 0: ``str.count('')`` would otherwise return
        ``len(text) + 1`` and mark every page as relevant.

        Returns a tuple of (is_relevant, relevance_info).
        """
        text_lower = text.lower()
        keyword_matches = {}

        for keyword in keywords:
            needle = keyword.lower()
            keyword_matches[keyword] = text_lower.count(needle) if needle.strip() else 0

        matching_keywords = [k for k, v in keyword_matches.items() if v > 0]

        # Page is relevant if any keyword is found
        is_relevant = bool(matching_keywords)

        relevance_info = {
            'is_relevant': is_relevant,
            'keyword_matches': keyword_matches,
            'total_matches': sum(keyword_matches.values()),
            'matching_keywords': matching_keywords,
            'text_length': len(text)
        }

        return is_relevant, relevance_info

    @staticmethod
    def _page_title(soup, fallback: str) -> str:
        """Return the stripped <title> text, or ``fallback`` when it is missing or empty."""
        # .string is None when <title> is empty or contains nested markup
        raw_title = soup.title.string if soup.title else None
        cleaned = raw_title.strip() if raw_title else ''
        return cleaned or fallback

    @staticmethod
    def _absolute_links(url: str, soup) -> List[str]:
        """Resolve every <a href> on the page against ``url``, skipping malformed hrefs."""
        resolved = []
        for anchor in soup.find_all('a', href=True):
            try:
                resolved.append(urljoin(url, anchor['href']))
            except ValueError:
                # urljoin raises ValueError on e.g. a broken IPv6 literal;
                # one bad link should not discard the whole page.
                continue
        return resolved

    def _process_page(self, url: str, keywords: List[str]) -> tuple[List[dict], List[str]]:
        """Fetch ``url`` once and evaluate it.

        Returns a tuple ``(results, links)``: ``results`` holds one entry when
        the page is relevant (otherwise it is empty) and ``links`` holds the
        valid http(s) links found on the page regardless of relevance, so the
        caller can keep walking the site without downloading the page again.

        Raises whatever the HTTP request or parsing raises; callers decide how
        to handle failures.
        """
        if url in self.visited_urls:
            logger.debug(f"Skipping already visited URL: {url}")
            return [], []

        # Marked before fetching so a failing URL is not retried
        self.visited_urls.add(url)
        logger.info(f"Crawling page: {url}")

        response = requests.get(url, timeout=self.request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        title = self._page_title(soup, url)
        text = ' '.join(
            tag.get_text()
            for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        )

        is_relevant, relevance_info = self._check_relevance(text, keywords)

        self.relevance_log.append({
            'url': url,
            'title': title,
            'timestamp': datetime.now().isoformat(),
            'relevance_info': relevance_info
        })

        if is_relevant:
            logger.info(
                f"Page is RELEVANT: {url}\n"
                f"- Title: {title}\n"
                f"- Matching keywords: {relevance_info['matching_keywords']}\n"
                f"- Total matches: {relevance_info['total_matches']}"
            )
        else:
            logger.info(
                f"Page is NOT RELEVANT: {url}\n"
                f"- Title: {title}\n"
                f"- Checked keywords: {keywords}\n"
                f"- No keyword matches found in {relevance_info['text_length']} characters of text"
            )

        anchor_urls = self._absolute_links(url, soup)
        links = [target for target in anchor_urls if self.is_valid_url(target)]

        if not is_relevant:
            return [], links

        # Only relevant pages trigger downloads of the files they link to
        if self.file_handler:
            for target in anchor_urls:
                if self.file_handler.is_supported_file(target):
                    downloaded_path = self.file_handler.download_file(target, source_page=url)
                    if downloaded_path:
                        logger.info(f"Downloaded file from relevant page: {target} to {downloaded_path}")

        page_record = {
            'url': url,
            'text': text,
            'title': title,
            'links': links,
            'relevance_info': relevance_info
        }
        return [page_record], links

    def crawl_page(self, url: str, keywords: List[str]) -> List[dict]:
        """Crawl a single page and extract relevant information.

        Returns a list with one result dict (url, text, title, links,
        relevance_info) when the page is relevant, otherwise an empty list.
        Any error while fetching or parsing is logged and yields an empty list.
        """
        try:
            results, _ = self._process_page(url, keywords)
            return results
        except Exception as e:
            # Best-effort crawl: a single broken page must not stop the run
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    def save_relevance_log(self, output_dir: str):
        """Save the relevance log to ``crawl_relevance_log.md`` inside ``output_dir``.

        The directory is created when it does not exist. Failures are logged
        rather than raised.
        """
        try:
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            log_file = os.path.join(output_dir, 'crawl_relevance_log.md')

            relevant_entries = [
                entry for entry in self.relevance_log
                if entry['relevance_info']['is_relevant']
            ]
            other_entries = [
                entry for entry in self.relevance_log
                if not entry['relevance_info']['is_relevant']
            ]
            total_pages = len(self.relevance_log)

            with open(log_file, 'w', encoding='utf-8') as f:
                f.write("# Web Crawling Relevance Log\n\n")

                # Summary statistics
                f.write("## Summary\n")
                f.write(f"- Total pages crawled: {total_pages}\n")
                f.write(f"- Relevant pages found: {len(relevant_entries)}\n")
                f.write(f"- Non-relevant pages: {total_pages - len(relevant_entries)}\n\n")

                f.write("## Relevant Pages\n\n")
                for entry in relevant_entries:
                    f.write(f"### {entry['title']}\n")
                    f.write(f"- URL: {entry['url']}\n")
                    f.write(f"- Matching keywords: {entry['relevance_info']['matching_keywords']}\n")
                    f.write(f"- Total matches: {entry['relevance_info']['total_matches']}\n")
                    f.write(f"- Crawled at: {entry['timestamp']}\n\n")

                f.write("## Non-Relevant Pages\n\n")
                for entry in other_entries:
                    f.write(f"### {entry['title']}\n")
                    f.write(f"- URL: {entry['url']}\n")
                    f.write(f"- Text length: {entry['relevance_info']['text_length']} characters\n")
                    f.write(f"- Crawled at: {entry['timestamp']}\n\n")

        except Exception as e:
            logger.error(f"Error saving relevance log: {str(e)}")

    def is_valid_url(self, url: str) -> bool:
        """Return True when ``url`` is an absolute http(s) URL with a host part."""
        try:
            parsed = urlparse(url)
            return bool(parsed.netloc and parsed.scheme in {'http', 'https'})
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed URL; TypeError/AttributeError: non-string input
            return False

    def extract_text_and_links(self, url: str, soup: "BeautifulSoup"):
        """Return every link on the page as an absolute URL.

        Despite the name, only links are returned; they are not filtered by
        scheme or domain.
        """
        return self._absolute_links(url, soup)

    def crawl_website(self, base_url: str, keywords: List[str]) -> List[dict]:
        """Crawl a website starting from the base URL.

        Follows links that stay on the domain of ``base_url`` until there is
        nothing left to visit or ``max_pages_per_site`` pages were attempted.
        Each page is downloaded once; a page that fails is logged and skipped.

        Returns the result dicts of all relevant pages found.
        """
        domain = urlparse(base_url).netloc
        to_visit = {base_url}
        results = []
        visited_count = 0

        while to_visit and visited_count < self.max_pages_per_site:
            url = to_visit.pop()
            # Failed attempts count too, so the number of requests stays bounded
            visited_count += 1

            try:
                page_results, links = self._process_page(url, keywords)
            except Exception as e:
                logger.error(f"Error crawling {url}: {str(e)}")
                continue

            results.extend(page_results)

            # Queue unseen links that stay on the starting domain
            to_visit.update(
                link for link in links
                if urlparse(link).netloc == domain and link not in self.visited_urls
            )

        return results

    @staticmethod
    def _parse_website_list(websites: str) -> List[str]:
        """Split a string such as ``'["https://a", "https://b"]'`` into URLs.

        Brackets, quotes and spaces are removed; empty entries are dropped.
        """
        stripped = websites.strip().strip('[]')
        entries = (
            part.replace('"', '').replace(' ', '').strip().strip("'")
            for part in stripped.split(',')
        )
        return [entry for entry in entries if entry]

    def crawl_all_websites(self, websites: List[str], keywords: List[str]) -> List[dict]:
        """Crawl multiple websites in parallel.

        ``websites`` may be a list of URLs or a single string holding a
        bracketed, comma-separated list (the form read from a config file).
        Sites that fail as a whole are logged and skipped.

        Returns the combined result dicts of all sites.
        """
        all_results = []

        if isinstance(websites, str):
            websites = self._parse_website_list(websites)

        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_url = {
                executor.submit(self.crawl_website, url, keywords): url
                for url in websites
            }

            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    results = future.result()
                    all_results.extend(results)
                    logger.info(f"Completed crawling {url}, found {len(results)} relevant pages")
                except Exception as e:
                    logger.error(f"Failed to crawl {url}: {str(e)}")

        return all_results

# Build the agents used by the workflow.
#
# Search agent backed by the DuckDuckGo tool. Its output is constrained to the
# SearchResults model (a list of NewsArticle: title, url, description).
# NOTE(review): the instructions ask for 20 articles / 15 results while the
# tool is created with fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS (10);
# presumably that caps results per query - confirm the numbers are intended.
searcher = Agent(
    model=get_hf_model('searcher'),
    tools=[DuckDuckGo(fixed_max_results=DUCK_DUCK_GO_FIXED_MAX_RESULTS)],

    instructions=[
        "Given a topic, search for 20 articles and return the 15 most relevant articles.",
        "For each article, provide:",
        "- title: The article title",
        "- url: The article URL",
        "- description: A brief description or summary",
        "Return the results in a structured format with these exact field names."
    ],
    response_model=SearchResults,
    structured_outputs=True
)

# Second search agent: same model key, instructions and response model as
# `searcher`, but using the GoogleSearch tool instead of DuckDuckGo.
# Presumably used as a fallback when the DuckDuckGo search fails or is
# rate-limited - verify against the workflow's run logic.
backup_searcher = Agent(
    model=get_hf_model('searcher'),
    tools=[GoogleSearch()],

    instructions=[
        "Given a topic, search for 20 articles and return the 15 most relevant articles.",
        "For each article, provide:",
        "- title: The article title",
        "- url: The article URL",
        "- description: A brief description or summary",
        "Return the results in a structured format with these exact field names."
    ],
    response_model=SearchResults,
    structured_outputs=True
)

# Report-writing agent. It has no tools; the instructions below define the
# required report structure (executive summary, detailed analysis, source
# credibility, takeaways, references) and the markdown formatting.
# NOTE(review): structured_outputs=True is set without a response_model,
# unlike the two search agents - confirm this is intended.
writer = Agent(
    model=get_hf_model('writer'),
    instructions=[

        "You are a professional research analyst tasked with creating a comprehensive report on the given topic.",
        "The sources provided include both general web search results and specialized intelligence/security websites.",
        "Carefully analyze and cross-reference information from all sources to create a detailed report.",
        "",
        "Report Structure:",
        "1. Executive Summary (2-3 paragraphs)",
        "   - Provide a clear, concise overview of the main findings",
        "   - Address the research question directly",
        "   - Highlight key discoveries and implications",
        "",
        "2. Detailed Analysis (Multiple sections)",
        "   - Break down the topic into relevant themes or aspects",
        "   - For each theme:",
        "     * Present detailed findings from multiple sources",
        "     * Cross-reference information between general and specialized sources",
        "     * Analyze trends, patterns, and developments",
        "     * Discuss implications and potential impacts",
        "",
        "3. Source Analysis and Credibility",
        "   For each major source:",
        "   - Evaluate source credibility and expertise",
        "   - Note if from specialized intelligence/security website",
        "   - Assess potential biases or limitations",
        "   - Key findings and unique contributions",
        "",
        "4. Key Takeaways and Strategic Implications",
        "   - Synthesize findings from all sources",
        "   - Compare/contrast general media vs specialized analysis",
        "   - Discuss broader geopolitical implications",
        "   - Address potential future developments",
        "",
        "5. References",
        "   - Group sources by type (specialized websites vs general media)",
        "   - List all sources with full citations",
        "   - Include URLs as clickable markdown links [Title](URL)",
        "   - Ensure every major claim has at least one linked source",
        "",
        "Important Guidelines:",
        "- Prioritize information from specialized intelligence/security sources",
        "- Cross-validate claims between multiple sources when possible",
        "- Maintain a professional, analytical tone",
        "- Support all claims with evidence",
        "- Include specific examples and data points",
        "- Use direct quotes for significant statements",
        "- Address potential biases in reporting",
        "- Ensure the report directly answers the research question",
        "",
        "Format the report with clear markdown headings (# ## ###), subheadings, and paragraphs.",
        "Each major section should contain multiple paragraphs with detailed analysis."
    ],
    structured_outputs=True
)

# Assemble the workflow from the three agents. The session id is derived from
# the module-level `topic`, and workflow state is stored in a SQLite database
# at tmp/workflows.db (table "generate_blog_post_workflows").
generate_blog_post = BlogPostGenerator(
    session_id=f"generate-blog-post-on-{topic}",
    searcher=searcher,
    backup_searcher=backup_searcher,
    writer=writer,
    file_handler=None,  # No file handler at construction time; presumably assigned later if needed - TODO confirm
    storage=SqlWorkflowStorage(
        table_name="generate_blog_post_workflows",
        db_file="tmp/workflows.db",
    ),
)

# Run the workflow for the configured topic with caching disabled.
# NOTE(review): this executes at import time (there is no
# `if __name__ == "__main__":` guard), so importing this module starts a run.
blog_post: Iterator[RunResponse] = generate_blog_post.run(topic=topic, use_cache=False)

# Pretty-print the responses as markdown
pprint_run_response(blog_post, markdown=True)