SERPent2 / scrap /gpatents.py
Game4all's picture
Initial commit
d907837
import re
from typing import Optional
from bs4 import BeautifulSoup
from pydantic import BaseModel
from scrap.base import ScrapperBackendBase
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""

    # NOTE: the docstring above is kept verbatim on purpose; pydantic surfaces
    # the class docstring as the description of the generated schema.

    # Patent title. This is the only field without a default, so it must
    # always be supplied.
    title: str

    # Patent abstract; stays None when the page provides none.
    abstract: Optional[str] = None

    # Complete description text (field of the invention, background,
    # summary, etc.); None when unavailable.
    description: Optional[str] = None

    # Complete claims text; None when unavailable.
    claims: Optional[str] = None

    # "Field of the invention" portion of the description; None when
    # unavailable.
    field_of_invention: Optional[str] = None

    # "Background of the invention" portion of the description; None when
    # unavailable.
    background: Optional[str] = None
class GpatentsScrapBackend(ScrapperBackendBase):
    """Scrap backend that fetches a Google Patents page and extracts its content."""

    # Heading-delimited slices of the description text. In both patterns,
    # group 1 is the opening heading, group 2 is the (lazily matched) text that
    # follows it, and group 3 is the heading that terminates the section.
    # Matching is case-insensitive and spans newlines.
    _FIELD_OF_INVENTION_RE = re.compile(
        r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))",
        re.IGNORECASE | re.DOTALL,
    )
    _BACKGROUND_RE = re.compile(
        r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))",
        re.IGNORECASE | re.DOTALL,
    )

    @property
    def content_type(self) -> str:
        """Return the kind of content this backend scrapes (``"patent"``)."""
        return "patent"

    @staticmethod
    def _node_text(node, **get_text_kwargs):
        """Return the stripped text of a parsed HTML node, or None if it is missing."""
        if node is None:
            return None
        return node.get_text(strip=True, **get_text_kwargs)

    @staticmethod
    def _extract_section(pattern, description):
        """Return the stripped text captured by the first match of *pattern*, or None."""
        if description is None:
            return None
        match = pattern.search(description)
        return match.group(2).strip() if match else None

    async def scrap(self, client, id) -> PatentScrapResult:
        """Fetch the English Google Patents page for a patent and parse it.

        Args:
            client: Asynchronous HTTP client. It must provide an awaitable
                ``get(url, headers=...)`` whose result exposes
                ``raise_for_status()`` and ``text``.
            id: Patent identifier, interpolated into the page URL as-is.
                (The name shadows the builtin but is part of the public
                signature, so it is kept.)

        Returns:
            A ``PatentScrapResult``. Every field except ``title`` is None when
            the corresponding part of the page cannot be found.

        Raises:
            ValueError: If the page has no ``DC.title`` meta tag with a
                ``content`` attribute, so no title can be determined.
            Exception: Whatever ``response.raise_for_status()`` raises for an
                unsuccessful HTTP response.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
        }
        patent_url = f"https://patents.google.com/patent/{id}/en"

        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # The title is the only mandatory field: fail early with a clear
        # message instead of an opaque AttributeError on None.
        title_meta = soup.find("meta", {"name": "DC.title"})
        title = title_meta.get("content") if title_meta is not None else None
        if title is None:
            raise ValueError(
                f"No DC.title meta content found on patent page {patent_url}"
            )

        abstract = self._node_text(soup.find("div", {"class": "abstract"}))
        # Description and claims keep one text fragment per line.
        description = self._node_text(
            soup.find("section", itemprop="description"), separator="\n"
        )
        claims = self._node_text(
            soup.find("section", itemprop="claims"), separator="\n"
        )

        return PatentScrapResult(
            title=title.strip(),
            abstract=abstract,
            description=description,
            claims=claims,
            field_of_invention=self._extract_section(
                self._FIELD_OF_INVENTION_RE, description
            ),
            background=self._extract_section(self._BACKGROUND_RE, description),
        )