SERPent2 / serp /bing.py
Game4all's picture
testing
71f836f
from asyncio import Semaphore
import logging
from urllib.parse import quote_plus
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page
class BingSerpBackend(PlaywrightSerpBackendBase):
    """SERP backend that scrapes the Bing web search results page via Playwright."""

    def __init__(self):
        super().__init__()
        # Handed to playwright_open_page on every query; presumably caps the
        # number of simultaneously open pages at 4 -- confirm in serp.base.
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(4)

    @property
    def name(self) -> str:
        """Identifier of this backend."""
        return "bing"

    async def query_serp_page(self, browser, query: SerpQuery):
        """Run ``query`` against Bing and return the parsed result entries.

        Opens a page through ``playwright_open_page``, aborts stylesheet and
        image requests, loads the Bing search URL for ``query.query`` and
        waits for at least one ``li.b_algo`` element. At most
        ``query.n_results`` of those elements are inspected; each one that
        yields both a non-empty title and an href becomes a ``SerpResultItem``.

        Args:
            browser: Browser object forwarded to ``playwright_open_page``.
            query: Search request; ``query.query`` is the search text and
                ``query.n_results`` bounds how many result nodes are read.

        Returns:
            A list of ``SerpResultItem`` with stripped title, href and body.
            Entries lacking a title or link are skipped, so the list may be
            shorter than ``query.n_results``.
        """
        blocked_resource_types = ("stylesheet", "image")
        # Tried in order; the first selector that yields non-blank text wins.
        snippet_selectors = (
            "div.b_caption p",  # typical snippet
            "div.b_caption",  # sometimes snippet is here
            "div.b_snippet",  # used in some result types
            "div.b_text",  # used in some panels
            "p",  # fallback to any paragraph
        )

        async def _block_resources(route, request):
            # Only the listed resource types are aborted; everything else
            # is allowed through unchanged.
            if request.resource_type not in blocked_resource_types:
                await route.continue_()
            else:
                await route.abort()

        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:
            await page.route("**/*", _block_resources)

            search_url = "https://www.bing.com/search?q=" + quote_plus(query.query)
            logging.info(search_url)
            await page.goto(search_url)
            await page.wait_for_selector("li.b_algo")

            collected = []
            result_nodes = await page.query_selector_all("li.b_algo")
            for node in result_nodes[:query.n_results]:
                link = await node.query_selector("h2 > a")
                if link:
                    href = await link.get_attribute("href")
                    heading = await link.inner_text()
                else:
                    href = None
                    heading = ""

                body = ""
                for css in snippet_selectors:
                    candidate = await node.query_selector(css)
                    if not candidate:
                        continue
                    body = await candidate.inner_text()
                    if body.strip():
                        break

                # A result is only kept when it has both a title and a link.
                if heading and href:
                    collected.append(
                        SerpResultItem(
                            title=heading.strip(),
                            href=href.strip(),
                            body=body.strip(),
                        )
                    )
            return collected

    @property
    def category(self):
        """Category label reported for this backend."""
        return "general"