import re
from typing import Optional

from bs4 import BeautifulSoup
from pydantic import BaseModel

from scrap.base import ScrapperBackendBase

# User-Agent header sent with every patent page request.
# NOTE(review): this string identifies the scraper as GPTBot; confirm that
# presenting this identity is intended.
_USER_AGENT = "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"

# Section patterns, compiled once at import time. Each pattern has a single
# capturing group holding the text between an opening heading and the first
# following terminator keyword.
#
# NOTE(review): matching is case-insensitive and not anchored to line starts,
# so terminator keywords such as "summary" or "description" also match when
# they occur as ordinary words inside the section body, which cuts the
# captured text short. A section that is not followed by any terminator
# keyword is not matched at all.
_FIELD_OF_INVENTION_RE = re.compile(
    r"(?:FIELD OF THE INVENTION|TECHNICAL FIELD)"
    r"(.*?)"
    r"(?:BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY"
    r"|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART)",
    re.IGNORECASE | re.DOTALL,
)
_BACKGROUND_RE = re.compile(
    r"(?:BACKGROUND OF THE INVENTION|BACKGROUND)"
    r"(.*?)"
    r"(?:SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION"
    r"|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION)",
    re.IGNORECASE | re.DOTALL,
)


def _extract_section(pattern, description):
    """Return the stripped text captured by the first match of *pattern*.

    Args:
        pattern: Compiled regex whose group 1 captures the section body.
        description: Full description text, or None when the page had no
            description section.

    Returns:
        The captured text with surrounding whitespace removed (possibly an
        empty string), or None when there is no description or no match.
    """
    if not description:
        return None
    match = pattern.search(description)
    return match.group(1).strip() if match else None


class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""

    # The title of the patent (required).
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description of the patent containing the field of the
    # invention, background, summary, etc.
    description: Optional[str] = None
    # The full claims of the patent.
    claims: Optional[str] = None
    # The field of the invention, if it could be located in the description.
    field_of_invention: Optional[str] = None
    # The background of the invention, if it could be located in the
    # description.
    background: Optional[str] = None


class GpatentsScrapBackend(ScrapperBackendBase):
    """Scraper backend that fetches and parses a Google Patents page."""

    @property
    def content_type(self) -> str:
        """Return the kind of content this backend produces (``"patent"``)."""
        return "patent"

    async def scrap(self, client, id: str) -> PatentScrapResult:
        """Download the English page of a patent and extract its sections.

        Args:
            client: Asynchronous HTTP client. It must provide an awaitable
                ``get(url, headers=...)`` returning a response that has
                ``raise_for_status()`` and ``text``.
                (Presumably an ``httpx.AsyncClient`` -- confirm with caller.)
            id: Patent identifier inserted into the Google Patents URL.
                The name shadows the ``id`` builtin but is kept because it
                is part of the method's signature.

        Returns:
            A ``PatentScrapResult``. Every field except ``title`` is None
            when the corresponding element is missing from the page.

        Raises:
            ValueError: If the page has no ``DC.title`` meta tag carrying a
                ``content`` attribute, so no title can be determined.
            Exception: Whatever ``response.raise_for_status()`` raises for
                an unsuccessful HTTP response.
        """
        patent_url = f"https://patents.google.com/patent/{id}/en"
        response = await client.get(
            patent_url, headers={"User-Agent": _USER_AGENT}
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(strip=True) if abstract_div else None

        # Description: one text node per line.
        description_section = soup.find("section", itemprop="description")
        description = (
            description_section.get_text(separator="\n", strip=True)
            if description_section
            else None
        )

        # Sub-sections carved out of the description text.
        invention_field = _extract_section(_FIELD_OF_INVENTION_RE, description)
        invention_background = _extract_section(_BACKGROUND_RE, description)

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = (
            claims_section.get_text(separator="\n", strip=True)
            if claims_section
            else None
        )

        # Patent title: required by the result schema, so fail with a clear
        # message instead of an AttributeError on None.
        title_meta = soup.find("meta", {"name": "DC.title"})
        raw_title = title_meta.get("content") if title_meta is not None else None
        if raw_title is None:
            raise ValueError(
                f"No DC.title meta tag with content found at {patent_url}"
            )

        return PatentScrapResult(
            title=raw_title.strip(),
            abstract=abstract,
            description=description,
            claims=claims,
            field_of_invention=invention_field,
            background=invention_background,
        )