File size: 2,374 Bytes
d907837
71f836f
d907837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71f836f
d907837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from asyncio import Semaphore
import logging
from urllib.parse import quote_plus
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page


class BingSerpBackend(PlaywrightSerpBackendBase):
    """SERP backend that scrapes organic results from a Bing search page.

    Results are read from the ``li.b_algo`` elements of
    ``https://www.bing.com/search`` using a Playwright page.
    """

    # Selectors tried in order, relative to one ``li.b_algo`` result element,
    # until one of them yields non-blank text.
    _SNIPPET_SELECTORS = (
        "div.b_caption p",  # typical snippet
        "div.b_caption",    # sometimes snippet is here
        "div.b_snippet",    # used in some result types
        "div.b_text",       # used in some panels
        "p",                # fallback to any paragraph
    )

    def __init__(self):
        super().__init__()
        # Passed to playwright_open_page on every query; presumably caps the
        # number of concurrently open pages at 4 -- confirm against that helper.
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(4)

    @property
    def name(self) -> str:
        """Identifier of this backend (``"bing"``)."""
        return "bing"

    async def query_serp_page(self, browser, query: SerpQuery):
        """Run ``query`` against Bing and return the parsed result items.

        Args:
            browser: Browser object forwarded to ``playwright_open_page``.
            query: Search request; ``query.query`` is the search text and
                ``query.n_results`` the maximum number of items to return
                (``None`` means no limit, a non-positive value yields an
                empty list).

        Returns:
            A list of ``SerpResultItem`` with stripped ``title``, ``href`` and
            ``body``. Entries lacking a title or link are skipped and do not
            count against ``query.n_results``.

        Raises:
            Whatever ``page.goto`` / ``page.wait_for_selector`` raise, e.g.
            when no ``li.b_algo`` element appears before Playwright's timeout.
        """
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:
            async def _block_resources(route, request):
                # Stylesheets and images are not needed to read the result
                # markup, so those requests are aborted.
                if request.resource_type in ("stylesheet", "image"):
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            search_url = f"https://www.bing.com/search?q={quote_plus(query.query)}"
            logging.info(search_url)
            await page.goto(search_url)

            await page.wait_for_selector("li.b_algo")

            limit = query.n_results
            results = []

            for item in await page.query_selector_all("li.b_algo"):
                # The limit is applied to accepted results, not to raw page
                # entries, so skipped entries do not shrink the output.
                if limit is not None and len(results) >= limit:
                    break

                title_el = await item.query_selector("h2 > a")
                if title_el is None:
                    continue

                href = await title_el.get_attribute("href")
                title = await title_el.inner_text()
                if not (title and href):
                    continue

                snippet = await self._extract_snippet(item)
                results.append(SerpResultItem(
                    title=title.strip(), href=href.strip(), body=snippet.strip()))

            return results

    async def _extract_snippet(self, item) -> str:
        """Return the snippet text of one result element, or ``""`` if none.

        Walks ``_SNIPPET_SELECTORS`` in order and stops at the first element
        whose text is not blank. The text is returned unstripped.
        """
        snippet = ""
        for selector in self._SNIPPET_SELECTORS:
            snippet_el = await item.query_selector(selector)
            if snippet_el:
                snippet = await snippet_el.inner_text()
                if snippet.strip():
                    break
        return snippet

    @property
    def category(self):
        """Category label of this backend (``"general"``)."""
        return "general"