|
|
|
from asyncio import Semaphore |
|
import re |
|
from urllib.parse import quote_plus |
|
from playwright.async_api import Browser |
|
|
|
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page |
|
|
|
|
|
class GPatentsSerpBackend(PlaywrightSerpBackendBase):
    """GPatents SERP backend for scraping patent data.

    Opens a Google Patents search page in a Playwright browser, reads the
    rendered ``search-result-item`` elements and converts each one that
    exposes a recognisable patent id into a ``SerpResultItem``.
    """

    # Patent id as it appears in the result text: two uppercase letters,
    # at least six digits, and an optional suffix made of one uppercase
    # letter optionally followed by one digit. Compiled once for the class
    # instead of being rebuilt on every query.
    _PATENT_ID_PATTERN = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    def __init__(self):
        """Initialise the base backend and the page-concurrency limiter."""
        super().__init__()
        # Handed to ``playwright_open_page`` on every query; created with
        # an initial value of 2.
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(2)

    @property
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs."""
        return "gpatents"

    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Query the GPatents SERP page and return a list of SerpResultItem.

        Args:
            browser: Playwright browser used to open the search page.
            query: Search request; ``query.query`` is URL-encoded into the
                ``q`` parameter and ``query.n_results`` is sent as ``num``.

        Returns:
            One ``SerpResultItem`` per rendered result whose text contains a
            patent id and whose title and snippet could both be read. Results
            missing any of these are skipped.

        Raises:
            Whatever ``page.goto`` / ``page.wait_for_function`` raise, e.g.
            when no ``search-result-item`` element appears within the
            30 second timeout.
        """
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:

            async def _block_resources(route, request):
                # Stylesheets and images are aborted; only the result text
                # is read below, so they are not needed.
                if request.resource_type in ["stylesheet", "image"]:
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://patents.google.com/?q={quote_plus(query.query)}&num={query.n_results}"
            await page.goto(url)

            # Wait until at least one result element exists in the DOM
            # before collecting the result locators.
            await page.wait_for_function(
                """() => document.querySelectorAll('search-result-item').length >= 1""",
                timeout=30_000,
            )

            items = await page.locator("search-result-item").all()
            results = []
            for item in items:
                # The patent id is searched for in the combined text of the
                # item's <span> elements.
                text = " ".join(await item.locator("span").all_inner_texts())
                match = self._PATENT_ID_PATTERN.search(text)
                if not match:
                    continue

                patent_id = match.group()

                try:
                    title = await item.locator("h3, h4").first.inner_text(timeout=1000)
                    body = await item.locator(
                        "div.abstract, div.result-snippet, .snippet, .result-text"
                    ).first.inner_text(timeout=1000)
                except Exception:
                    # Best effort: skip results whose title or snippet cannot
                    # be read in time. ``Exception`` rather than a bare
                    # ``except`` so that task cancellation
                    # (``asyncio.CancelledError``) and interpreter-exit
                    # signals still propagate.
                    continue

                results.append(
                    SerpResultItem(
                        href=f"https://patents.google.com/patent/{patent_id}/en",
                        title=title,
                        body=body,
                        patent_id=patent_id,
                        content_slug=f"{self.name}:{patent_id}",
                    )
                )

            return results

    @property
    def category(self) -> str:
        """Category label of this backend; always ``"patent"``."""
        return "patent"
|
|
|
|