from asyncio import Semaphore
import logging
from urllib.parse import quote_plus

from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page


class BingSerpBackend(PlaywrightSerpBackendBase):
    """SERP backend that scrapes Bing's web search result page via Playwright."""

    # Resource types that are aborted while the result page loads; the
    # scraper only reads text and attributes from the DOM.
    _BLOCKED_RESOURCE_TYPES = ("stylesheet", "image")

    # Snippet selectors, tried in order until one yields non-blank text.
    _SNIPPET_SELECTORS = (
        "div.b_caption p",  # typical snippet
        "div.b_caption",  # sometimes snippet is here
        "div.b_snippet",  # used in some result types
        "div.b_text",  # used in some panels
        "p",  # fallback to any paragraph
    )

    def __init__(self):
        """Initialise the base backend and create this backend's semaphore."""
        super().__init__()
        # Handed to playwright_open_page for every query; presumably it
        # bounds the number of concurrently open pages (confirm in serp.base).
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(4)

    @property
    def name(self) -> str:
        """Identifier of this backend."""
        return "bing"

    @property
    def category(self) -> str:
        """Category label of this backend."""
        return "general"

    @classmethod
    async def _block_resources(cls, route, request):
        """Abort requests for blocked resource types, let the rest through."""
        if request.resource_type in cls._BLOCKED_RESOURCE_TYPES:
            await route.abort()
        else:
            await route.continue_()

    async def _extract_snippet(self, item) -> str:
        """Return the stripped text of the first selector with non-blank text.

        Returns an empty string when no selector in ``_SNIPPET_SELECTORS``
        matches an element with visible text inside ``item``.
        """
        for selector in self._SNIPPET_SELECTORS:
            snippet_el = await item.query_selector(selector)
            if snippet_el is None:
                continue
            text = (await snippet_el.inner_text()).strip()
            if text:
                return text
        return ""

    async def query_serp_page(self, browser, query: SerpQuery) -> list:
        """Run ``query`` against Bing and return the parsed organic results.

        Args:
            browser: Browser object passed through to ``playwright_open_page``.
            query: Supplies the search text (``query.query``) and the maximum
                number of result elements to inspect (``query.n_results``).

        Returns:
            A list of ``SerpResultItem`` objects, one per inspected
            ``li.b_algo`` element that has both a non-blank title and a
            non-blank link. Items missing either are skipped, so the list
            may be shorter than ``query.n_results``.

        Raises:
            Whatever ``page.goto`` / ``page.wait_for_selector`` raise, e.g. a
            Playwright timeout when no ``li.b_algo`` element appears.
        """
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:
            await page.route("**/*", self._block_resources)

            url = f"https://www.bing.com/search?q={quote_plus(query.query)}"
            logging.info(url)
            await page.goto(url)
            await page.wait_for_selector("li.b_algo")

            results = []
            items = await page.query_selector_all("li.b_algo")
            for item in items[:query.n_results]:
                title_el = await item.query_selector("h2 > a")
                if title_el is None:
                    # No title link means no usable result; skip before
                    # spending round-trips on the snippet.
                    continue

                # get_attribute returns None when the attribute is absent.
                href = (await title_el.get_attribute("href") or "").strip()
                title = (await title_el.inner_text()).strip()
                # Check the stripped values so whitespace-only titles or
                # links are not emitted as empty results.
                if not title or not href:
                    continue

                snippet = await self._extract_snippet(item)
                results.append(SerpResultItem(title=title, href=href, body=snippet))

            return results