Spaces:

Game4all
/

SERPent2

Running

App Files Files Community

SERPent2 / scrap /gpatents.py

Game4all

Initial commit

d907837 16 days ago

raw

history blame contribute delete

3.21 kB

	import re
	from typing import Optional
	from bs4 import BeautifulSoup
	from pydantic import BaseModel

	from scrap.base import ScrapperBackendBase


	class PatentScrapResult(BaseModel):
	    """Structured content extracted from a single Google Patents page.

	    Only ``title`` is required. Every other field defaults to ``None`` so a
	    result can still be built when the corresponding part of the page is
	    not available.
	    """

	    # Patent title (always required).
	    title: str

	    # Abstract text, when available.
	    abstract: Optional[str] = None

	    # Complete description text; it contains the field of the invention,
	    # the background, the summary, etc.
	    description: Optional[str] = None

	    # Complete claims text.
	    claims: Optional[str] = None

	    # "Field of the invention" part of the description, when available.
	    field_of_invention: Optional[str] = None

	    # "Background of the invention" part of the description, when available.
	    background: Optional[str] = None


	class GpatentsScrapBackend(ScrapperBackendBase):
	    """Scraper backend that downloads and parses a Google Patents page."""

	    # User-Agent header sent with every request.
	    # NOTE(review): this identifies the scraper as GPTBot -- confirm that
	    # presenting this identity is intended.
	    _USER_AGENT = "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"

	    # Both patterns have the same shape: group 1 is the section heading,
	    # group 2 is the (lazily matched) section body, and the trailing group is
	    # the heading of whichever section follows and therefore ends the body.
	    _FIELD_RE = re.compile(
	        r"(FIELD OF THE INVENTION|TECHNICAL FIELD)"
	        r"(.*?)"
	        r"(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY"
	        r"|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))",
	        re.IGNORECASE | re.DOTALL,
	    )
	    _BACKGROUND_RE = re.compile(
	        r"(BACKGROUND OF THE INVENTION|BACKGROUND)"
	        r"(.*?)"
	        r"(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION"
	        r"|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))",
	        re.IGNORECASE | re.DOTALL,
	    )

	    @property
	    def content_type(self) -> str:
	        """Return the kind of content this backend produces (``"patent"``)."""
	        return "patent"

	    @staticmethod
	    def _extract_section(pattern, text: Optional[str]) -> Optional[str]:
	        """Return the stripped body of the first ``pattern`` match in ``text``.

	        Returns ``None`` when ``text`` is ``None`` or the pattern does not match.
	        """
	        if text is None:
	            return None
	        match = pattern.search(text)
	        return match.group(2).strip() if match else None

	    async def scrap(self, client, id) -> PatentScrapResult:
	        """Fetch the English Google Patents page for ``id`` and parse it.

	        Args:
	            client: Asynchronous HTTP client; must provide an awaitable
	                ``get(url, headers=...)`` whose response exposes
	                ``raise_for_status()`` and ``text`` (presumably an
	                ``httpx.AsyncClient`` -- confirm against the caller).
	            id: Patent identifier inserted into the page URL.

	        Returns:
	            A ``PatentScrapResult``. Every field except ``title`` is ``None``
	            when the corresponding part of the page is not found.

	        Raises:
	            ValueError: If the page has no usable ``DC.title`` meta tag.
	            Whatever ``response.raise_for_status()`` raises on an HTTP error.
	        """
	        patent_url = f"https://patents.google.com/patent/{id}/en"
	        response = await client.get(
	            patent_url, headers={"User-Agent": self._USER_AGENT}
	        )
	        response.raise_for_status()

	        soup = BeautifulSoup(response.text, "html.parser")

	        # Abstract
	        abstract_div = soup.find("div", {"class": "abstract"})
	        abstract = abstract_div.get_text(strip=True) if abstract_div else None

	        # Description
	        description_section = soup.find("section", itemprop="description")
	        description = (
	            description_section.get_text(separator="\n", strip=True)
	            if description_section
	            else None
	        )

	        # Sub-sections are carved out of the description text by heading.
	        invention_field = self._extract_section(self._FIELD_RE, description)
	        invention_background = self._extract_section(
	            self._BACKGROUND_RE, description
	        )

	        # Claims
	        claims_section = soup.find("section", itemprop="claims")
	        claims = (
	            claims_section.get_text(separator="\n", strip=True)
	            if claims_section
	            else None
	        )

	        # Patent title: required by the result schema, so fail with a clear
	        # message instead of an AttributeError on a missing tag or attribute.
	        title_tag = soup.find("meta", {"name": "DC.title"})
	        title_content = title_tag.get("content") if title_tag else None
	        if title_content is None:
	            raise ValueError(
	                f"No DC.title meta tag with content found for patent {id!r}"
	            )
	        meta_title = title_content.strip()

	        return PatentScrapResult(
	            abstract=abstract,
	            description=description,
	            claims=claims,
	            title=meta_title,
	            field_of_invention=invention_field,
	            background=invention_background,
	        )