File size: 4,557 Bytes
3beb07e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from contextlib import asynccontextmanager
from typing import Optional
from duckduckgo_search import DDGS
from pydantic import BaseModel
from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
from urllib.parse import quote_plus
import logging
import re


class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint"""
    # Error message for a failed request; may be None
    # (presumably None on success -- confirm against the endpoint handler).
    error: Optional[str]
    # Patent hits as plain dicts; may be None
    # (presumably None when `error` is set -- confirm against the endpoint handler).
    results: Optional[list[dict]]


class APISearchResults(BaseModel):
    """Response model pairing an optional error message with optional results.

    NOTE(review): presumably returned by the web-search endpoints; the
    handlers are not visible in this file -- confirm.
    """
    # Error message for a failed request; may be None.
    error: Optional[str]
    # Search hits as plain dicts; may be None.
    results: Optional[list[dict]]


class BraveSearchBlockedException(Exception):
    """Signals that the headless browser was flagged as suspicious.

    A marker exception with no state of its own: it only lets callers
    distinguish a blocked browser from other failures.
    """


@asynccontextmanager
async def playwright_open_page(browser: Browser):
    """Open a page in a fresh browser context and always clean both up.

    Args:
        browser: Browser used to create a new, isolated context.

    Yields:
        A new page belonging to the freshly created context.

    The context is closed even when creating the page fails or when
    closing the page raises, so a failure never leaks a browser context.
    """
    context: BrowserContext = await browser.new_context()
    try:
        # Created inside the outer try so that a failure here still
        # closes the context that was just opened.
        page: Page = await context.new_page()
        try:
            yield page
        finally:
            await page.close()
    finally:
        await context.close()


async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Search Google Patents for ``q`` and return the matching patents.

    Args:
        browser: Playwright browser used to open a temporary page.
        q: Free-text search query.
        n_results: Maximum number of patents to return.

    Returns:
        A list of at most ``n_results`` dicts, each with an ``"id"`` (the
        patent identifier found in the result text) and an ``"href"``
        (the Google Patents URL built from that identifier).

    Raises:
        TimeoutError: If no result item at all is rendered within 30 seconds.
    """
    # Two capital letters, at least six digits, and an optional kind code
    # made of a letter plus an optional digit.
    patent_id_pattern = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Stylesheets and images are not needed to read the result text.
            if request.resource_type in ("stylesheet", "image"):
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        encoded_query = quote_plus(q)
        url = f"https://patents.google.com/?q=({encoded_query})&oq={encoded_query}&num={n_results}"
        await page.goto(url)

        wait_timeout = None
        try:
            await page.wait_for_function(
                f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
                timeout=30_000
            )
        except TimeoutError as exc:
            # A query with fewer hits than requested can never satisfy the
            # wait condition; decide below whether anything was rendered.
            wait_timeout = exc

        items = await page.locator("search-result-item").all()

        if wait_timeout is not None:
            if not items:
                # Nothing rendered at all: keep reporting the timeout.
                raise wait_timeout
            logging.warning(
                f"Only {len(items)} of {n_results} requested patent results were rendered for query: {q}")

        patent_ids = []
        for item in items:
            all_text = " ".join(await item.locator("span").all_inner_texts())
            match = patent_id_pattern.search(all_text)
            if match:
                # Only the first identifier in a result item is kept.
                patent_ids.append(match.group(0))

    return [
        {"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id}
        for patent_id in patent_ids
    ][:n_results]


async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Scrape the Brave Search results page for ``q``.

    Returns a list of dicts with "title", "body" and "href" keys, stopping
    once ``n_results`` entries have been collected. Entries without an href,
    or whose href starts with "/", are skipped.

    Raises BraveSearchBlockedException when no result card is found and the
    page content contains the word "suspicious". A selector timeout while
    reading the cards is logged and whatever was collected so far is returned.
    """

    async def _skip_styles_and_images(route, request):
        # Let everything through except stylesheet and image requests.
        if request.resource_type not in ("stylesheet", "image"):
            await route.continue_()
            return
        await route.abort()

    async with playwright_open_page(browser) as page:
        await page.route("**/*", _skip_styles_and_images)
        await page.goto(f"https://search.brave.com/search?q={quote_plus(q)}")

        cards = await page.locator('.snippet').all()

        if not cards:
            logging.warning(f"No results for query: {q}")
            html = await page.content()

            if "suspicious" in html:
                logging.warning("Brave search flagged browser as suspicious.")
                raise BraveSearchBlockedException()

        hits = []

        try:
            for card in cards:
                titles = await card.locator('.title').all_inner_texts()
                descriptions = await card.locator('.snippet-description').all_inner_texts()
                href = await card.locator('a').nth(0).get_attribute('href')

                # Keep only cards that have an href not starting with "/"
                if href is not None and not href.startswith('/'):
                    hits.append({
                        "title": titles[0] if titles else "",
                        "body": descriptions[0] if descriptions else "",
                        "href": href
                    })

                    if len(hits) >= n_results:
                        break

        except TimeoutError as e:
            logging.warning(
                f"Timeout on selector while parsing Brave Search SERP: {e}")

        return hits


async def query_ddg_search(q: str, n_results: int = 10):
    """Query DuckDuckGo text search for ``q``.

    Args:
        q: Free-text search query.
        n_results: Passed to DuckDuckGo as ``max_results``.

    Returns:
        A list of dicts with "title", "body" and "href" keys, one per result.
    """
    # Local import keeps this block self-contained.
    import asyncio

    def _run_search():
        # list() drains the results whether text() hands back a list or a
        # lazily evaluated iterator.
        return list(DDGS().text(q, max_results=n_results))

    # The DDGS call is blocking; running it in a worker thread keeps the
    # event loop free to serve other coroutines in the meantime.
    raw_results = await asyncio.to_thread(_run_search)

    return [
        {"title": result["title"], "body": result["body"], "href": result["href"]}
        for result in raw_results
    ]