File size: 2,799 Bytes
d907837 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
from asyncio import Semaphore
import re
from urllib.parse import quote_plus
from playwright.async_api import Browser
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page
class GPatentsSerpBackend(PlaywrightSerpBackendBase):
    """GPatents SERP backend for scraping patent data."""

    # Patent id as it appears in a result item's text: two uppercase letters,
    # at least six digits, then an optional uppercase letter with an optional
    # trailing digit (e.g. "US1234567B2"). Compiled once for all queries.
    _PATENT_ID_PATTERN = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    def __init__(self):
        """Initialise the base backend and create this backend's semaphore."""
        super().__init__()
        # Handed to playwright_open_page on every query; presumably caps the
        # number of pages open at once at 2 -- confirm against serp.base.
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(2)

    @property
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs."""
        return "gpatents"

    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Query the GPatents SERP page and return a list of SerpResultItem.

        Opens ``https://patents.google.com/`` with ``query.query`` (URL-quoted)
        and ``query.n_results`` as the ``q`` and ``num`` parameters, waits for
        at least one ``search-result-item`` element, and converts every item
        that exposes a patent id, a title and a body into a ``SerpResultItem``.

        Args:
            browser: Playwright browser used to open the page.
            query: Search request; only ``query`` and ``n_results`` are read.

        Returns:
            One ``SerpResultItem`` per usable result, in page order. Items
            without a recognisable patent id, or whose title/body cannot be
            read within one second each, are skipped.

        Raises:
            Errors from ``page.goto`` / ``page.wait_for_function`` (e.g. no
            result appearing within 30 seconds) are not handled here and
            propagate to the caller.
        """
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:

            async def _block_resources(route, request):
                # Stylesheets and images are not needed for text scraping.
                if request.resource_type in ("stylesheet", "image"):
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://patents.google.com/?q={quote_plus(query.query)}&num={query.n_results}"
            await page.goto(url)

            # Wait for at least one search result item to appear.
            # This ensures the page has loaded enough to start scraping.
            await page.wait_for_function(
                """() => document.querySelectorAll('search-result-item').length >= 1""",
                timeout=30_000,
            )

            items = await page.locator("search-result-item").all()
            results = []
            for item in items:
                # The patent id is searched for in the combined text of the
                # item's <span> elements.
                text = " ".join(await item.locator("span").all_inner_texts())
                match = self._PATENT_ID_PATTERN.search(text)
                if not match:
                    continue
                patent_id = match.group()

                try:
                    title = await item.locator("h3, h4").first.inner_text(timeout=1000)
                    body = await item.locator(
                        "div.abstract, div.result-snippet, .snippet, .result-text"
                    ).first.inner_text(timeout=1000)
                except Exception:
                    # Best effort: skip items whose title or body is missing.
                    # Deliberately not a bare ``except`` so that task
                    # cancellation (asyncio.CancelledError) and
                    # KeyboardInterrupt still propagate.
                    continue

                results.append(SerpResultItem(
                    href=f"https://patents.google.com/patent/{patent_id}/en",
                    title=title,
                    body=body,
                    patent_id=patent_id,
                    content_slug=f"{self.name}:{patent_id}",
                ))
            return results

    @property
    def category(self) -> str:
        """Category label reported by this backend."""
        return "patent"
|