Spaces:

Game4all
/

SERPent

Running

File size: 4,557 Bytes

3beb07e

import asyncio
import logging
import re
from contextlib import asynccontextmanager
from typing import Optional
from urllib.parse import quote_plus

from duckduckgo_search import DDGS
from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
from pydantic import BaseModel


class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint.

    Both fields default to ``None`` so a response can be built from only the
    field that applies. Without an explicit default, pydantic v2 treats an
    ``Optional[...]`` field as required.
    """
    # Error message; None when not set.
    error: Optional[str] = None
    # Result entries; None when not set.
    # NOTE(review): presumably the dicts produced by the patent query helper
    # ("href"/"id" keys) - confirm against the endpoint code.
    results: Optional[list[dict]] = None


class APISearchResults(BaseModel):
    """Response model carrying either an error message or a list of results.

    Both fields default to ``None`` so a response can be built from only the
    field that applies. Without an explicit default, pydantic v2 treats an
    ``Optional[...]`` field as required.
    """
    # Error message; None when not set.
    error: Optional[str] = None
    # Result entries; None when not set.
    # NOTE(review): presumably the "title"/"body"/"href" dicts built by the
    # search helpers - confirm against the endpoint code.
    results: Optional[list[dict]] = None


class BraveSearchBlockedException(Exception):
    """Marker exception: the headless browser was flagged as suspicious.

    Carries no extra state; it only exists so callers can tell this
    situation apart from other failures.
    """


@asynccontextmanager
async def playwright_open_page(browser: Browser):
    """Context manager for playwright pages.

    Creates a new browser context on ``browser``, opens one page in it and
    yields that page to the ``async with`` body. On exit the page is closed
    first, then the context.

    Args:
        browser: Browser on which the new context is created.

    Yields:
        The newly opened page.
    """
    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()
        try:
            yield page
        finally:
            await page.close()
    finally:
        # Outer finally: the context is closed even when new_page() or
        # page.close() raises, so it is never left open on the browser.
        await context.close()


async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Query Google Patents and return links to the patents found.

    Opens a temporary page, loads the Google Patents search for ``q`` and
    extracts the first patent id found in the text of each result item.

    Args:
        browser: Browser used to open the temporary page.
        q: Search query; it is URL-encoded before being put in the URL.
        n_results: Number of results requested and maximum number returned.

    Returns:
        A list of at most ``n_results`` dicts with the keys ``"href"`` (URL of
        the patent page) and ``"id"`` (the matched patent id). Result items in
        which no patent id is found are skipped.

    Raises:
        TimeoutError: Playwright's timeout error, when no result item at all
            is present after the 30 second wait.
    """
    # Regex to locate a patent id; compiled once instead of per result item.
    patent_id_pattern = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Stylesheets and images are not needed to read the result text.
            if request.resource_type in ("stylesheet", "image"):
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://patents.google.com/?q=({quote_plus(q)})&oq={quote_plus(q)}&num={n_results}"
        await page.goto(url)

        result_items = page.locator("search-result-item")

        try:
            await page.wait_for_function(
                f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
                timeout=30_000
            )
        except TimeoutError:
            # A query can legitimately have fewer hits than requested. Keep
            # whatever was rendered and only fail when nothing showed up.
            if not await result_items.all():
                raise
            logging.warning(
                "Fewer than %d patent results rendered for query: %s",
                n_results, q)

        patents = []
        for item in await result_items.all():
            all_text = " ".join(await item.locator("span").all_inner_texts())
            match = patent_id_pattern.search(all_text)
            if match:
                patent_id = match.group(0)
                patents.append({
                    "href": f"https://patents.google.com/patent/{patent_id}/en",
                    "id": patent_id,
                })

    return patents[:n_results]


async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Run a Brave Search query in a temporary page and scrape its results.

    Each ``.snippet`` element of the results page is turned into a dict with
    the keys ``"title"``, ``"body"`` and ``"href"``. Snippets whose first link
    has no ``href`` or an ``href`` starting with ``/`` are skipped. Collection
    stops as soon as ``n_results`` entries were gathered.

    If a Playwright timeout occurs while reading the snippets, a warning is
    logged and the entries gathered so far are returned.

    Raises:
        BraveSearchBlockedException: when the page has no snippet and its
            content contains the word "suspicious".
    """
    async with playwright_open_page(browser) as page:

        async def _skip_heavy_resources(route, request):
            # Let everything through except stylesheets and images.
            if request.resource_type not in ("stylesheet", "image"):
                await route.continue_()
            else:
                await route.abort()

        await page.route("**/*", _skip_heavy_resources)
        await page.goto(f"https://search.brave.com/search?q={quote_plus(q)}")

        cards = await page.locator('.snippet').all()

        if not cards:
            logging.warning(f"No results for query: {q}")
            html = await page.content()
            if "suspicious" in html:
                logging.warning("Brave search flagged browser as suspicious.")
                raise BraveSearchBlockedException()

        hits = []

        try:
            for card in cards:
                titles = await card.locator('.title').all_inner_texts()
                descriptions = await card.locator('.snippet-description').all_inner_texts()
                href = await card.locator('a').nth(0).get_attribute('href')

                # Keep only results that have a URL which is not brave-specific
                if href is not None and not href.startswith('/'):
                    hits.append({
                        "title": titles[0] if titles else "",
                        "body": descriptions[0] if descriptions else "",
                        "href": href
                    })

                    if len(hits) >= n_results:
                        break

        except TimeoutError as exc:
            logging.warning(
                f"Timeout on selector while parsing Brave Search SERP: {exc}")

        return hits


async def query_ddg_search(q: str, n_results: int = 10):
    """Query DuckDuckGo text search for the specified query.

    Args:
        q: Search query.
        n_results: Passed to the client as ``max_results``.

    Returns:
        A list of dicts with the keys ``"title"``, ``"body"`` and ``"href"``,
        one per result yielded by the client.
    """

    def _search():
        # The client is consumed synchronously, so the whole request and the
        # iteration over its results happen inside this helper.
        return [
            {"title": hit["title"], "body": hit["body"], "href": hit["href"]}
            for hit in DDGS().text(q, max_results=n_results)
        ]

    # Run the blocking call in a worker thread so the event loop is not
    # stalled while the request is in flight.
    return await asyncio.to_thread(_search)