from asyncio import Semaphore
import re
from urllib.parse import quote_plus

from playwright.async_api import Browser

from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page


class GPatentsSerpBackend(PlaywrightSerpBackendBase):
    """GPatents SERP backend for scraping patent data."""

    # Matches a patent id in result text: two uppercase letters, at least six
    # digits, and an optional kind code (one letter plus an optional digit).
    _PATENT_ID_PATTERN = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    def __init__(self):
        super().__init__()
        # Passed to playwright_open_page for every query issued by this backend.
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(2)

    @property
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs."""
        return "gpatents"

    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Query the GPatents SERP page and return a list of SerpResultItem.

        Args:
            browser: Playwright browser used to open the results page.
            query: Search request; ``query.query`` is the search text and
                ``query.n_results`` is sent as the ``num`` URL parameter.

        Returns:
            One SerpResultItem per ``search-result-item`` element for which a
            patent id, a title and a body snippet could all be extracted.
            Elements missing any of the three are skipped.

        Raises:
            Errors raised by Playwright while navigating or while waiting
            (up to 30 seconds) for the first result element are propagated.
        """
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:

            async def _block_resources(route, request):
                # Stylesheets and images are aborted; only text is read from the page.
                if request.resource_type in ("stylesheet", "image"):
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://patents.google.com/?q={quote_plus(query.query)}&num={query.n_results}"
            await page.goto(url)

            # Wait for at least one search result item to appear.
            # This ensures the page has loaded enough to start scraping.
            await page.wait_for_function(
                """() => document.querySelectorAll('search-result-item').length >= 1""",
                timeout=30_000,
            )

            items = await page.locator("search-result-item").all()
            results = []
            for item in items:
                text = " ".join(await item.locator("span").all_inner_texts())
                match = self._PATENT_ID_PATTERN.search(text)
                if not match:
                    continue
                patent_id = match.group()

                try:
                    title = await item.locator("h3, h4").first.inner_text(timeout=1000)
                    body = await item.locator(
                        "div.abstract, div.result-snippet, .snippet, .result-text"
                    ).first.inner_text(timeout=1000)
                except Exception:
                    # If we can't get title or body, skip this item. Catching
                    # Exception (not a bare except) lets asyncio.CancelledError
                    # and KeyboardInterrupt propagate instead of being swallowed.
                    continue

                results.append(SerpResultItem(
                    href=f"https://patents.google.com/patent/{patent_id}/en",
                    title=title,
                    body=body,
                    patent_id=patent_id,
                    content_slug=f"{self.name}:{patent_id}",
                ))

            return results

    @property
    def category(self) -> str:
        """Category label for results produced by this backend."""
        return "patent"