import os
import sys
import asyncio
from typing import List, Dict, Optional
from langchain_community.tools import DuckDuckGoSearchResults
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from dotenv import load_dotenv
load_dotenv()
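# load_dotenv() reads variables from a local .env file if one is present; the
# DuckDuckGo search tool itself needs no API key, so this only matters for any
# optional environment-based configuration you add around this script.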
class AdvancedWebCrawler:
    def __init__(self,
                 max_search_results: int = 5,
                 word_count_threshold: int = 50,
                 content_filter_type: str = 'pruning',
                 filter_threshold: float = 0.48):
        """
        Initialize the Advanced Web Crawler

        Args:
            max_search_results (int): Maximum number of search results to process
            word_count_threshold (int): Minimum word count for crawled content
            content_filter_type (str): Type of content filter ('pruning' or 'bm25')
            filter_threshold (float): Threshold for content filtering
        """
        self.max_search_results = max_search_results
        self.word_count_threshold = word_count_threshold
        self.content_filter_type = content_filter_type
        self.filter_threshold = filter_threshold
    def _create_web_search_tool(self):
        """
        Create a web search tool using DuckDuckGo

        Returns:
            DuckDuckGoSearchResults: Web search tool
        """
        return DuckDuckGoSearchResults(max_results=self.max_search_results, output_format="list")
    def _create_content_filter(self, user_query: Optional[str] = None):
        """
        Create content filter based on specified type

        Args:
            user_query (Optional[str]): Query to use for BM25 filtering

        Returns:
            Content filter strategy
        """
        if self.content_filter_type == 'bm25' and user_query:
            return BM25ContentFilter(
                user_query=user_query,
                bm25_threshold=self.filter_threshold
            )
        else:
            return PruningContentFilter(
                threshold=self.filter_threshold,
                threshold_type="fixed",
                min_word_threshold=self.word_count_threshold
            )
    async def crawl_urls(self, urls: List[str], user_query: Optional[str] = None):
        """
        Crawl multiple URLs with content filtering

        Args:
            urls (List[str]): List of URLs to crawl
            user_query (Optional[str]): Query used for BM25 content filtering

        Returns:
            List of crawl results
        """
        async with AsyncWebCrawler(
            browser_type="chromium",
            headless=True,
            verbose=True
        ) as crawler:
            # Create appropriate content filter
            content_filter = self._create_content_filter(user_query)
            # Run crawling for multiple URLs
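            # bypass_cache=True and cache_mode=CacheMode.DISABLED both force fresh
            # fetches (the former is the older spelling of the latter in crawl4ai);
            # simulate_user and magic turn on the library's anti-bot heuristics,
            # such as simulated interaction and automatic overlay/popup handling.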
            results = await crawler.arun_many(
                urls=urls,
                word_count_threshold=self.word_count_threshold,
                bypass_cache=True,
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=content_filter
                ),
                cache_mode=CacheMode.DISABLED,
                exclude_external_links=True,
                remove_overlay_elements=True,
                simulate_user=True,
                magic=True
            )
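            # markdown_v2 carries the filtered markdown output (raw_markdown,
            # fit_markdown, ...) in the crawl4ai release this script targets;
            # newer releases expose the same data as result.markdown.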
            # Process and return crawl results
            processed_results = []
            for result in results:
                crawl_result = {
                    "url": result.url,
                    "success": result.success,
                    "title": result.metadata.get('title', 'N/A'),
                    "content": result.markdown_v2.raw_markdown if result.success else result.error_message,
                    "word_count": len(result.markdown_v2.raw_markdown.split()) if result.success else 0,
                    "links": {
                        "internal": len(result.links.get('internal', [])),
                        "external": len(result.links.get('external', []))
                    },
                    "images": len(result.media.get('images', []))
                }
                processed_results.append(crawl_result)
            return processed_results
    async def search_and_crawl(self, query: str) -> List[Dict]:
        """
        Perform web search and crawl the results

        Args:
            query (str): Search query

        Returns:
            List of crawled content results
        """
        # Perform web search
        search_tool = self._create_web_search_tool()
        try:
            search_results = search_tool.invoke({"query": query})
            # Extract URLs from search results
            urls = [result['link'] for result in search_results]
            print(f"Found {len(urls)} URLs for query: {query}")
            # Crawl URLs
            crawl_results = await self.crawl_urls(urls, user_query=query)
            return crawl_results
        except Exception as e:
            print(f"Web search and crawl error: {e}")
            return []
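# Illustrative sketch (not part of the original flow): the crawler can also be
# driven with an explicit URL list, skipping the DuckDuckGo search step. The URL
# below is a placeholder.
#
#   crawler = AdvancedWebCrawler()
#   results = asyncio.run(crawler.crawl_urls(["https://example.com/article"]))
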
def main():
    # Example usage
    crawler = AdvancedWebCrawler(
        max_search_results=5,
        word_count_threshold=50,
        content_filter_type='pruning',
        filter_threshold=0.48
    )
    test_queries = [
        "Latest developments in AI agents",
        "Today's weather forecast in Kolkata",
    ]
    for query in test_queries:
        # Run search and crawl asynchronously
        results = asyncio.run(crawler.search_and_crawl(query))
        print(f"\nResults for query: {query}")
        for result in results:
            print(f"URL: {result['url']}")
            print(f"Success: {result['success']}")
            print(f"Title: {result['title']}")
            print(f"Word Count: {result['word_count']}")
            print(f"Content Preview: {result['content'][:500]}...\n")


if __name__ == "__main__":
    main()
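# Illustrative BM25 variant (values are placeholders, not tuned): keep only the
# page content that is relevant to the search query instead of generic pruning.
#
#   crawler = AdvancedWebCrawler(content_filter_type='bm25', filter_threshold=1.0)
#   results = asyncio.run(crawler.search_and_crawl("Latest developments in AI agents"))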