Spaces:

re-mind
/

Crawl4AI

Running

Crawl4AI / docs /examples /quickstart_async.config.py

amaye15

test

03c0888 7 months ago

22.3 kB

	import os, sys

	sys.path.append(
	os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	)

	import asyncio
	import time
	import json
	import re
	from typing import Dict, List
	from bs4 import BeautifulSoup
	from pydantic import BaseModel, Field
	from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
	from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
	from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
	from crawl4ai.extraction_strategy import (
	JsonCssExtractionStrategy,
	LLMExtractionStrategy,
	)

	# Real path of the directory holding this file; os.path.join only prepends the
	# current working directory when __file__ is relative.
	__location__ = os.path.realpath(
	    os.path.join(os.getcwd(), os.path.dirname(__file__))
	)

	# Startup banner: one entry per output line.
	print(
	    "Crawl4AI: Advanced Web Crawling and Data Extraction",
	    "GitHub Repository: https://github.com/unclecode/crawl4ai",
	    "Twitter: @unclecode",
	    "Website: https://crawl4ai.com",
	    sep="\n",
	)


	# Basic Example - Simple Crawl
	async def simple_crawl():
	    """Crawl the NBC News business page and print the first 500 markdown characters.

	    The crawl runs in a headless browser with the cache bypassed.
	    """
	    print("\n--- Basic Usage ---")
	    headless_browser = BrowserConfig(headless=True)
	    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
	    target_url = "https://www.nbcnews.com/business"

	    async with AsyncWebCrawler(config=headless_browser) as crawler:
	        page_result = await crawler.arun(url=target_url, config=run_settings)
	        preview = page_result.markdown[:500]
	        print(preview)


	async def clean_content():
	    """Crawl the Wikipedia "Apple" article and compare raw vs. filtered markdown sizes.

	    The run excludes nav/footer/aside tags, removes overlay elements and attaches
	    a pruning content filter to the markdown generator; the lengths of
	    ``raw_markdown`` and ``fit_markdown`` are printed.
	    """
	    pruning_filter = PruningContentFilter(
	        threshold=0.48, threshold_type="fixed", min_word_threshold=0
	    )
	    generator = DefaultMarkdownGenerator(
	        content_filter=pruning_filter,
	        options={"ignore_links": True},
	    )
	    run_settings = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        excluded_tags=["nav", "footer", "aside"],
	        remove_overlay_elements=True,
	        markdown_generator=generator,
	    )

	    async with AsyncWebCrawler() as crawler:
	        page_result = await crawler.arun(
	            url="https://en.wikipedia.org/wiki/Apple",
	            config=run_settings,
	        )
	        # Measure both variants before printing anything.
	        raw_size = len(page_result.markdown_v2.raw_markdown)
	        fit_size = len(page_result.markdown_v2.fit_markdown)
	        print(f"Full Markdown Length: {raw_size}")
	        print(f"Fit Markdown Length: {fit_size}")

	async def link_analysis():
	    """Crawl the NBC News business page and report the links found on it.

	    Prints the number of internal and external links in ``result.links`` and the
	    href/text of the first five internal links. The run is configured with the
	    cache enabled and with external and social-media links excluded.
	    """
	    run_settings = CrawlerRunConfig(
	        cache_mode=CacheMode.ENABLED,
	        exclude_external_links=True,
	        exclude_social_media_links=True,
	    )

	    async with AsyncWebCrawler() as crawler:
	        page_result = await crawler.arun(
	            url="https://www.nbcnews.com/business",
	            config=run_settings,
	        )
	        internal_links = page_result.links["internal"]
	        print(f"Found {len(internal_links)} internal links")
	        print(f"Found {len(page_result.links['external'])} external links")

	        for entry in internal_links[:5]:
	            print(f"Href: {entry['href']}\nText: {entry['text']}\n")

	# JavaScript Execution Example
	async def simple_example_with_running_js_code():
	    """Crawl the NBC News business page while running a "Load More" click script.

	    The JavaScript snippet looks for a button whose text contains "Load More"
	    and clicks it if present; the first 500 markdown characters are printed.
	    """
	    print("\n--- Executing JavaScript and Using CSS Selectors ---")

	    browser_settings = BrowserConfig(headless=True, java_script_enabled=True)

	    click_load_more = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
	    run_settings = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        js_code=click_load_more,
	        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
	    )

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        page_result = await crawler.arun(
	            url="https://www.nbcnews.com/business", config=run_settings
	        )
	        print(page_result.markdown[:500])


	# CSS Selector Example
	async def simple_example_with_css_selector():
	    """Crawl the NBC News business page with a CSS selector set on the run config.

	    The run uses ``css_selector=".wide-tease-item__description"`` and prints the
	    first 500 characters of the resulting markdown.
	    """
	    print("\n--- Using CSS Selectors ---")
	    browser_settings = BrowserConfig(headless=True)
	    description_selector = ".wide-tease-item__description"
	    run_settings = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        css_selector=description_selector,
	    )

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        page_result = await crawler.arun(
	            url="https://www.nbcnews.com/business", config=run_settings
	        )
	        print(page_result.markdown[:500])

	async def media_handling():
	    """Crawl the NBC News business page and print details of the first five images.

	    For each of the first five entries in ``result.media['images']`` the source
	    URL, alt text and score are printed. The run bypasses the cache, excludes
	    external images and requests a screenshot.
	    """
	    run_settings = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        exclude_external_images=True,
	        screenshot=True,
	    )

	    async with AsyncWebCrawler() as crawler:
	        page_result = await crawler.arun(
	            url="https://www.nbcnews.com/business",
	            config=run_settings,
	        )
	        first_images = page_result.media["images"][:5]
	        for image in first_images:
	            print(f"Image URL: {image['src']}, Alt: {image['alt']}, Score: {image['score']}")

	async def custom_hook_workflow(verbose=True):
	    """Crawl crawl4ai.com with a ``before_goto`` hook and print a markdown preview.

	    The hook prints a message before navigation. The first 500 characters of the
	    raw markdown are printed with newlines replaced by " -- ".

	    Args:
	        verbose: Kept for call compatibility; the body does not read it.
	    """

	    def announce_navigation(page, context=None, **kwargs):
	        # Accept (and ignore) any extra keyword arguments, matching the
	        # ``**kwargs`` convention of the other hook in this file, so the hook
	        # does not raise TypeError if the crawler passes more than page/context.
	        print("[Hook] Preparing to navigate...")

	    async with AsyncWebCrawler() as crawler:
	        # Set a 'before_goto' hook to run custom code just before navigation
	        crawler.crawler_strategy.set_hook("before_goto", announce_navigation)

	        # Perform the crawl operation
	        result = await crawler.arun(url="https://crawl4ai.com")
	        print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))


	# Proxy Example
	async def use_proxy():
	    """Crawl the NBC News business page through a placeholder proxy configuration.

	    The proxy server and credentials are example values. The first 500 markdown
	    characters are printed only when the crawl reports success.
	    """
	    print("\n--- Using a Proxy ---")
	    proxy_settings = {
	        "server": "http://proxy.example.com:8080",
	        "username": "username",
	        "password": "password",
	    }
	    browser_settings = BrowserConfig(headless=True, proxy_config=proxy_settings)
	    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        page_result = await crawler.arun(
	            url="https://www.nbcnews.com/business", config=run_settings
	        )
	        if not page_result.success:
	            return
	        print(page_result.markdown[:500])


	# Screenshot Example
	async def capture_and_save_screenshot(url: str, output_path: str):
	    """Crawl ``url`` with a screenshot requested and write the image to ``output_path``.

	    The screenshot on the result is base64-decoded before being written in
	    binary mode. Missing parent directories of ``output_path`` are created so
	    the write does not fail on a fresh checkout.

	    Args:
	        url: Page to crawl.
	        output_path: File path the decoded screenshot bytes are written to.
	    """
	    import base64

	    browser_config = BrowserConfig(headless=True)
	    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)

	    async with AsyncWebCrawler(config=browser_config) as crawler:
	        result = await crawler.arun(url=url, config=crawler_config)

	        if not (result.success and result.screenshot):
	            print("Failed to capture screenshot")
	            return

	        # A bare file name has an empty dirname; os.makedirs("") would raise.
	        parent_dir = os.path.dirname(output_path)
	        if parent_dir:
	            os.makedirs(parent_dir, exist_ok=True)

	        with open(output_path, "wb") as f:
	            f.write(base64.b64decode(result.screenshot))
	        print(f"Screenshot saved successfully to {output_path}")


	# LLM Extraction Example
	class OpenAIModelFee(BaseModel):
	    # Schema for one pricing record; its JSON schema is handed to the LLM
	    # extraction strategy. Documented with comments rather than a docstring so
	    # the generated schema stays exactly as before.
	    model_name: str = Field(default=..., description="Name of the OpenAI model.")
	    input_fee: str = Field(
	        default=..., description="Fee for input token for the OpenAI model."
	    )
	    output_fee: str = Field(
	        default=..., description="Fee for output token for the OpenAI model."
	    )


	async def extract_structured_data_using_llm(
	    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
	):
	    """Crawl the OpenAI pricing page and print LLM-extracted model fees.

	    Args:
	        provider: Provider string passed to ``LLMExtractionStrategy``.
	        api_token: API token for the provider; if it is ``None`` and the provider
	            is not "ollama", the example prints a notice and returns.
	        extra_headers: Optional headers added to the strategy's ``extra_args``.
	    """
	    print(f"\n--- Extracting Structured Data with {provider} ---")

	    token_required = provider != "ollama"
	    if token_required and api_token is None:
	        print(f"API token is required for {provider}. Skipping this example.")
	        return

	    browser_settings = BrowserConfig(headless=True)

	    llm_options = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
	    if extra_headers:
	        llm_options.update(extra_headers=extra_headers)

	    llm_strategy = LLMExtractionStrategy(
	        provider=provider,
	        api_token=api_token,
	        schema=OpenAIModelFee.model_json_schema(),
	        extraction_type="schema",
	        instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
	Do not miss any models in the entire content.""",
	        extra_args=llm_options,
	    )
	    run_settings = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        word_count_threshold=1,
	        page_timeout=80000,
	        extraction_strategy=llm_strategy,
	    )

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        page_result = await crawler.arun(
	            url="https://openai.com/api/pricing/", config=run_settings
	        )
	        print(page_result.extracted_content)


	# CSS Extraction Example
	async def extract_structured_data_using_css_extractor():
	    """Extract course data from the KidoCode technology page with a CSS schema.

	    A JavaScript snippet clicks through every tab before extraction. The number
	    of extracted items and the first item (if any) are printed. A failed crawl or
	    empty extraction is reported instead of raising from ``json.loads`` or from
	    indexing an empty list.
	    """
	    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
	    schema = {
	        "name": "KidoCode Courses",
	        "baseSelector": "section.charge-methodology .w-tab-content > div",
	        "fields": [
	            {
	                "name": "section_title",
	                "selector": "h3.heading-50",
	                "type": "text",
	            },
	            {
	                "name": "section_description",
	                "selector": ".charge-content",
	                "type": "text",
	            },
	            {
	                "name": "course_name",
	                "selector": ".text-block-93",
	                "type": "text",
	            },
	            {
	                "name": "course_description",
	                "selector": ".course-content-text",
	                "type": "text",
	            },
	            {
	                "name": "course_icon",
	                "selector": ".image-92",
	                "type": "attribute",
	                "attribute": "src",
	            },
	        ],
	    }

	    browser_config = BrowserConfig(headless=True, java_script_enabled=True)

	    # Clicks each tab in turn, pausing 500 ms after every click.
	    js_click_tabs = """
	(async () => {
	const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
	for(let tab of tabs) {
	tab.scrollIntoView();
	tab.click();
	await new Promise(r => setTimeout(r, 500));
	}
	})();
	"""

	    crawler_config = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        extraction_strategy=JsonCssExtractionStrategy(schema),
	        js_code=[js_click_tabs],
	    )

	    async with AsyncWebCrawler(config=browser_config) as crawler:
	        result = await crawler.arun(
	            url="https://www.kidocode.com/degrees/technology", config=crawler_config
	        )

	        # json.loads(None) raises TypeError, so bail out when nothing came back.
	        if not result.success or not result.extracted_content:
	            print("Failed to extract structured data")
	            return

	        companies = json.loads(result.extracted_content)
	        print(f"Successfully extracted {len(companies)} companies")
	        # Only show a sample when at least one item was extracted.
	        if companies:
	            print(json.dumps(companies[0], indent=2))


	# Dynamic Content Examples - Method 1
	async def crawl_dynamic_content_pages_method_1():
	    """Crawl three pages of the TypeScript commit list using a session and a hook.

	    Pages after the first are reached by running a script that clicks the
	    "next" pagination button within the same session. An ``on_execution_started``
	    hook polls the first commit heading until its text differs from the one last
	    seen. Commits are counted by parsing ``cleaned_html`` with BeautifulSoup.
	    """
	    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
	    first_commit = ""

	    async def on_execution_started(page, **kwargs):
	        # Poll until the first commit heading differs from the remembered one.
	        nonlocal first_commit
	        heading_selector = "li.Box-sc-g0xbh4-0 h4"
	        try:
	            while True:
	                await page.wait_for_selector(heading_selector)
	                heading = await page.query_selector(heading_selector)
	                heading_text = await heading.evaluate("(element) => element.textContent")
	                commit = re.sub(r"\s+", "", heading_text)
	                if commit and commit != first_commit:
	                    first_commit = commit
	                    break
	                await asyncio.sleep(0.5)
	        except Exception as e:
	            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

	    browser_settings = BrowserConfig(headless=False, java_script_enabled=True)

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

	        url = "https://github.com/microsoft/TypeScript/commits/main"
	        session_id = "typescript_commits_session"
	        all_commits = []

	        js_next_page = """
	const button = document.querySelector('a[data-testid="pagination-next-button"]');
	if (button) button.click();
	"""

	        for page_index in range(3):
	            # Only follow-up pages run the pagination script in js-only mode.
	            is_followup = page_index > 0
	            run_settings = CrawlerRunConfig(
	                cache_mode=CacheMode.BYPASS,
	                css_selector="li.Box-sc-g0xbh4-0",
	                js_code=js_next_page if is_followup else None,
	                js_only=is_followup,
	                session_id=session_id,
	            )

	            page_result = await crawler.arun(url=url, config=run_settings)
	            assert page_result.success, f"Failed to crawl page {page_index + 1}"

	            parsed = BeautifulSoup(page_result.cleaned_html, "html.parser")
	            commits = parsed.select("li")
	            all_commits.extend(commits)

	            print(f"Page {page_index + 1}: Found {len(commits)} commits")

	        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


	# Dynamic Content Examples - Method 2
	async def crawl_dynamic_content_pages_method_2():
	    """Crawl three pages of the TypeScript commit list, waiting inside the page script.

	    The injected JavaScript clicks the "next" pagination button and then polls
	    every 100 ms until the first commit heading changes. Commit titles are
	    extracted with a ``JsonCssExtractionStrategy`` schema and counted per page.
	    """
	    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

	    browser_settings = BrowserConfig(headless=False, java_script_enabled=True)

	    js_next_page_and_wait = """
	(async () => {
	const getCurrentCommit = () => {
	const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
	return commits.length > 0 ? commits[0].textContent.trim() : null;
	};

	const initialCommit = getCurrentCommit();
	const button = document.querySelector('a[data-testid="pagination-next-button"]');
	if (button) button.click();

	while (true) {
	await new Promise(resolve => setTimeout(resolve, 100));
	const newCommit = getCurrentCommit();
	if (newCommit && newCommit !== initialCommit) {
	break;
	}
	}
	})();
	"""

	    title_field = {
	        "name": "title",
	        "selector": "h4.markdown-title",
	        "type": "text",
	        "transform": "strip",
	    }
	    schema = {
	        "name": "Commit Extractor",
	        "baseSelector": "li.Box-sc-g0xbh4-0",
	        "fields": [title_field],
	    }

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        url = "https://github.com/microsoft/TypeScript/commits/main"
	        session_id = "typescript_commits_session"
	        all_commits = []

	        extraction_strategy = JsonCssExtractionStrategy(schema)

	        for page_index in range(3):
	            # Only follow-up pages run the pagination script in js-only mode.
	            is_followup = page_index > 0
	            run_settings = CrawlerRunConfig(
	                cache_mode=CacheMode.BYPASS,
	                css_selector="li.Box-sc-g0xbh4-0",
	                extraction_strategy=extraction_strategy,
	                js_code=js_next_page_and_wait if is_followup else None,
	                js_only=is_followup,
	                session_id=session_id,
	            )

	            page_result = await crawler.arun(url=url, config=run_settings)
	            assert page_result.success, f"Failed to crawl page {page_index + 1}"

	            commits = json.loads(page_result.extracted_content)
	            all_commits.extend(commits)
	            print(f"Page {page_index + 1}: Found {len(commits)} commits")

	        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


	async def cosine_similarity_extraction():
	    """Crawl an NBC News article with ``CosineStrategy`` and print the first five results.

	    ``CosineStrategy`` is imported locally because the file's top-level imports
	    do not include it; without the import this function raised ``NameError``.
	    """
	    from crawl4ai.extraction_strategy import CosineStrategy

	    crawl_config = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        extraction_strategy=CosineStrategy(
	            word_count_threshold=10,
	            max_dist=0.2,  # Maximum distance between two words
	            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
	            top_k=3,  # Number of top keywords to extract
	            sim_threshold=0.3,  # Similarity threshold for clustering
	            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
	            verbose=True,
	        ),
	    )
	    async with AsyncWebCrawler() as crawler:
	        result = await crawler.arun(
	            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
	            config=crawl_config,
	        )
	        print(json.loads(result.extracted_content)[:5])

	# Browser Comparison
	async def crawl_custom_browser_type():
	    """Crawl example.com with Firefox, WebKit and Chromium, timing each run.

	    For each browser the elapsed time (measured from just before the crawler is
	    opened) and the first 500 markdown characters are printed.
	    """
	    print("\n--- Browser Comparison ---")

	    # (browser_type passed to BrowserConfig, label printed next to the timing)
	    browsers = (
	        ("firefox", "Firefox:"),
	        ("webkit", "WebKit:"),
	        ("chromium", "Chromium:"),  # Chromium (default)
	    )

	    for browser_type, label in browsers:
	        browser_settings = BrowserConfig(browser_type=browser_type, headless=True)
	        started = time.time()
	        async with AsyncWebCrawler(config=browser_settings) as crawler:
	            page_result = await crawler.arun(
	                url="https://www.example.com",
	                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
	            )
	            print(label, time.time() - started)
	            print(page_result.markdown[:500])


	# Anti-Bot and User Simulation
	async def crawl_with_user_simulation():
	    """Crawl a placeholder URL with user-simulation options and print the markdown.

	    The browser uses a randomly generated Android mobile user agent; the run
	    enables ``magic``, ``simulate_user`` and ``override_navigator``. The URL is
	    the literal placeholder "YOUR-URL-HERE".
	    """
	    user_agent_options = {"device_type": "mobile", "os_type": "android"}
	    browser_settings = BrowserConfig(
	        headless=True,
	        user_agent_mode="random",
	        user_agent_generator_config=user_agent_options,
	    )

	    run_settings = CrawlerRunConfig(
	        cache_mode=CacheMode.BYPASS,
	        magic=True,
	        simulate_user=True,
	        override_navigator=True,
	    )

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        page_result = await crawler.arun(url="YOUR-URL-HERE", config=run_settings)
	        print(page_result.markdown)

	async def ssl_certification():
	    """Fetch the SSL certificate of https://example.com, print it and export it.

	    On success the issuer CN, expiry and fingerprint are printed and the
	    certificate is exported as JSON, PEM and DER into a ``tmp`` directory next
	    to this file. The export directory is defined and created here; it was
	    previously an undefined name, which raised ``NameError``.
	    """
	    # Configure crawler to fetch SSL certificate
	    config = CrawlerRunConfig(
	        fetch_ssl_certificate=True,
	        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
	    )

	    async with AsyncWebCrawler() as crawler:
	        result = await crawler.arun(url="https://example.com", config=config)

	        if result.success and result.ssl_certificate:
	            cert = result.ssl_certificate

	            # 1. Access certificate properties directly
	            print("\nCertificate Information:")
	            print(f"Issuer: {cert.issuer.get('CN', '')}")
	            print(f"Valid until: {cert.valid_until}")
	            print(f"Fingerprint: {cert.fingerprint}")

	            # 2. Export certificate in different formats
	            # Create the export directory only once there is something to write.
	            tmp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
	            os.makedirs(tmp_dir, exist_ok=True)

	            json_path = os.path.join(tmp_dir, "certificate.json")
	            pem_path = os.path.join(tmp_dir, "certificate.pem")
	            der_path = os.path.join(tmp_dir, "certificate.der")

	            cert.to_json(json_path)  # For analysis
	            print("\nCertificate exported to:")
	            print(f"- JSON: {json_path}")

	            cert.to_pem(pem_path)  # For web servers
	            print(f"- PEM: {pem_path}")

	            cert.to_der(der_path)  # For Java apps
	            print(f"- DER: {der_path}")

	# Speed Comparison
	async def speed_comparison():
	    """Time Firecrawl against two Crawl4AI runs on the NBC News business page.

	    Firecrawl is imported locally and needs the ``FIRECRAWL_API_KEY`` environment
	    variable. For each run the elapsed time, content length and number of
	    occurrences of the NBC image host in the markdown are printed.
	    """
	    print("\n--- Speed Comparison ---")

	    # Firecrawl comparison
	    from firecrawl import FirecrawlApp

	    firecrawl_client = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
	    started = time.time()
	    scrape_status = firecrawl_client.scrape_url(
	        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
	    )
	    elapsed = time.time() - started
	    print("Firecrawl:")
	    print(f"Time taken: {elapsed:.2f} seconds")
	    print(f"Content length: {len(scrape_status['markdown'])} characters")
	    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
	    print()

	    # Crawl4AI comparisons
	    browser_settings = BrowserConfig(headless=True)

	    async with AsyncWebCrawler(config=browser_settings) as crawler:
	        # Simple crawl
	        started = time.time()
	        page_result = await crawler.arun(
	            url="https://www.nbcnews.com/business",
	            config=CrawlerRunConfig(
	                cache_mode=CacheMode.BYPASS, word_count_threshold=0
	            ),
	        )
	        elapsed = time.time() - started
	        print("Crawl4AI (simple crawl):")
	        print(f"Time taken: {elapsed:.2f} seconds")
	        print(f"Content length: {len(page_result.markdown)} characters")
	        print(f"Images found: {page_result.markdown.count('cldnry.s-nbcnews.com')}")
	        print()

	        # Advanced filtering
	        started = time.time()
	        pruning_filter = PruningContentFilter(
	            threshold=0.48, threshold_type="fixed", min_word_threshold=0
	        )
	        page_result = await crawler.arun(
	            url="https://www.nbcnews.com/business",
	            config=CrawlerRunConfig(
	                cache_mode=CacheMode.BYPASS,
	                word_count_threshold=0,
	                markdown_generator=DefaultMarkdownGenerator(
	                    content_filter=pruning_filter
	                ),
	            ),
	        )
	        elapsed = time.time() - started
	        print("Crawl4AI (Markdown Plus):")
	        print(f"Time taken: {elapsed:.2f} seconds")
	        print(f"Content length: {len(page_result.markdown_v2.raw_markdown)} characters")
	        print(f"Fit Markdown: {len(page_result.markdown_v2.fit_markdown)} characters")
	        print(f"Images found: {page_result.markdown.count('cldnry.s-nbcnews.com')}")
	        print()


	# Main execution
	async def main():
	    """Run the enabled examples; only the OpenAI LLM extraction is active.

	    The other examples are left commented out so they can be enabled one at a
	    time.
	    """
	    # Basic examples
	    # await simple_crawl()
	    # await simple_example_with_running_js_code()
	    # await simple_example_with_css_selector()

	    # Advanced examples
	    # await extract_structured_data_using_css_extractor()
	    openai_token = os.getenv("OPENAI_API_KEY")
	    await extract_structured_data_using_llm("openai/gpt-4o", openai_token)
	    # await crawl_dynamic_content_pages_method_1()
	    # await crawl_dynamic_content_pages_method_2()

	    # Browser comparisons
	    # await crawl_custom_browser_type()

	    # Performance testing
	    # await speed_comparison()

	    # Screenshot example
	    # await capture_and_save_screenshot(
	    # "https://www.example.com",
	    # os.path.join(__location__, "tmp/example_screenshot.jpg")
	    # )


	# Script entry point: run the async main() on a new event loop when this file
	# is executed directly rather than imported.
	if __name__ == "__main__":
	asyncio.run(main())