SERPent2 / serp /gpatents.py
Game4all's picture
Initial commit
d907837
from asyncio import Semaphore
import re
from urllib.parse import quote_plus
from playwright.async_api import Browser
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page
class GPatentsSerpBackend(PlaywrightSerpBackendBase):
    """GPatents SERP backend for scraping patent data from Google Patents."""

    # Patent publication number: two uppercase letters, at least six digits,
    # then an optional letter that may be followed by a single digit
    # (e.g. "US1234567B2"). Compiled once instead of on every query.
    _PATENT_ID_RE = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    def __init__(self):
        super().__init__()
        # Handed to playwright_open_page() on every query; presumably caps the
        # number of simultaneously open pages at two -- confirm in serp.base.
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(2)

    @property
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs."""
        return "gpatents"

    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Query the GPatents SERP page and return a list of SerpResultItem.

        Args:
            browser: Playwright browser used to open the search page.
            query: Search request; ``query.query`` is URL-encoded into the
                ``q`` parameter and ``query.n_results`` is sent as ``num``.

        Returns:
            One ``SerpResultItem`` per ``search-result-item`` element for which
            a patent id, a title and a snippet could all be extracted. Elements
            missing any of the three are skipped.

        Raises:
            Whatever ``page.goto`` / ``page.wait_for_function`` raise, e.g. a
            Playwright timeout when no result element appears within 30 s.
        """
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:

            async def _block_resources(route, request):
                # Stylesheets and images are not needed for text scraping.
                if request.resource_type in ["stylesheet", "image"]:
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://patents.google.com/?q={quote_plus(query.query)}&num={query.n_results}"
            await page.goto(url)

            # Wait for at least one search result item to appear.
            # This ensures the page has loaded enough to start scraping.
            await page.wait_for_function(
                """() => document.querySelectorAll('search-result-item').length >= 1""",
                timeout=30_000,
            )

            items = await page.locator("search-result-item").all()
            results = []

            for item in items:
                # The patent id is searched for in the combined text of all
                # <span> elements of the result entry.
                text = " ".join(await item.locator("span").all_inner_texts())
                match = self._PATENT_ID_RE.search(text)
                if not match:
                    continue

                patent_id = match.group()

                try:
                    title = await item.locator("h3, h4").first.inner_text(timeout=1000)
                    body = await item.locator(
                        "div.abstract, div.result-snippet, .snippet, .result-text"
                    ).first.inner_text(timeout=1000)
                except Exception:
                    # Best effort: if we can't get title or body, skip this
                    # item. `Exception` (rather than a bare `except`) lets
                    # asyncio.CancelledError and KeyboardInterrupt propagate,
                    # so a cancelled task actually stops scraping.
                    continue

                results.append(SerpResultItem(
                    href=f"https://patents.google.com/patent/{patent_id}/en",
                    title=title,
                    body=body,
                    patent_id=patent_id,
                    content_slug=f"{self.name}:{patent_id}",
                ))

            return results

    @property
    def category(self) -> str:
        """Category label of this backend."""
        return "patent"