Spaces:

Game4all
/

SERPent

Running

App Files Files Community

SERPent / backends.py

Game4all

Add search_duck + search route

3beb07e 17 days ago

raw

history blame

4.56 kB

	from contextlib import asynccontextmanager
	from typing import Optional
	from duckduckgo_search import DDGS
	from pydantic import BaseModel
	from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
	from urllib.parse import quote_plus
	import logging
	import re


	class APIPatentResults(BaseModel):
	    """Response of /search_patents endpoint"""
	    # NOTE(review): neither field declares a default value; whether that makes
	    # the fields required depends on the pydantic major version in use — confirm.

	    # Error text, or None. Presumably populated only when the search
	    # failed — confirm against the route handler.
	    error: Optional[str]
	    # List of result dicts, or None. query_google_patents in this file builds
	    # dicts with "href" and "id" keys; presumably those are what is returned
	    # here — confirm against the route handler.
	    results: Optional[list[dict]]


	class APISearchResults(BaseModel):
	    # Response model pairing an optional error with an optional result list.
	    # NOTE(review): presumably used by the web-search endpoints — the routes are
	    # not visible in this file, confirm. No class docstring is added on purpose:
	    # pydantic exposes the docstring as the schema description.
	    # NOTE(review): neither field declares a default value; whether that makes
	    # the fields required depends on the pydantic major version in use — confirm.

	    # Error text, or None. Presumably populated only when the search
	    # failed — confirm against the route handler.
	    error: Optional[str]
	    # List of result dicts, or None. query_brave_search and query_ddg_search in
	    # this file build dicts with "title", "body" and "href" keys; presumably
	    # those are what is returned here — confirm against the route handler.
	    results: Optional[list[dict]]


	class BraveSearchBlockedException(Exception):
	    """Marker exception signalling that the headless browser was flagged as suspicious.

	    Carries no extra state; it exists only so callers can tell this
	    situation apart from other failures.
	    """


	@asynccontextmanager
	async def playwright_open_page(browser: Browser):
	    """Open a page in a fresh browser context and clean both up on exit.

	    Args:
	        browser: Playwright browser used to create the new context.

	    Yields:
	        A new page created inside that new context.

	    The page is closed first, then the context. The context is closed even
	    when creating the page or closing the page raises, so a failure in
	    either step cannot leak the context.
	    """
	    context: BrowserContext = await browser.new_context()
	    try:
	        page: Page = await context.new_page()
	        try:
	            yield page
	        finally:
	            await page.close()
	    finally:
	        # Outer finally: runs even if new_page() or page.close() raised.
	        await context.close()


	async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
	    """Search Google Patents and return links to the patents found.

	    Args:
	        browser: Playwright browser used to open a temporary page.
	        q: Search query; URL-encoded before being placed in the request URL.
	        n_results: Number of results requested from the site and the maximum
	            number of entries returned.

	    Returns:
	        A list of at most ``n_results`` dicts with the keys ``"href"`` (URL of
	        the patent page) and ``"id"`` (first patent id found in the result
	        item's text). Result items without a recognisable id are skipped.

	    Raises:
	        playwright.async_api.TimeoutError: if fewer than ``n_results`` result
	            items appear on the page within 30 seconds.
	    """
	    # Two uppercase letters, at least six digits, then an optional letter with
	    # an optional digit (e.g. "US1234567B2").
	    patent_id_pattern = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

	    async with playwright_open_page(browser) as page:

	        async def _block_resources(route, request):
	            # Stylesheets and images are not needed to read the result list.
	            if request.resource_type in ("stylesheet", "image"):
	                await route.abort()
	            else:
	                await route.continue_()

	        # "**/*" matches every request URL. A single "*" does not cross "/",
	        # so the previous "*/" pattern never matched an absolute URL and the
	        # handler above was never invoked.
	        await page.route("**/*", _block_resources)

	        encoded_query = quote_plus(q)
	        url = f"https://patents.google.com/?q=({encoded_query})&oq={encoded_query}&num={n_results}"
	        await page.goto(url)

	        # Results are rendered client-side; wait until enough items exist.
	        await page.wait_for_function(
	            f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
	            timeout=30_000,
	        )

	        patent_ids = []
	        for item in await page.locator("search-result-item").all():
	            all_text = " ".join(await item.locator("span").all_inner_texts())
	            match = patent_id_pattern.search(all_text)
	            if match:
	                patent_ids.append(match.group(0))

	    patents = [
	        {"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id}
	        for patent_id in patent_ids
	    ]

	    return patents[:n_results]


	async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
	    """Search Brave Search by scraping its results page.

	    Args:
	        browser: Playwright browser used to open a temporary page.
	        q: Search query; URL-encoded before being placed in the request URL.
	        n_results: Maximum number of results to return. Zero or a negative
	            value yields an empty list.

	    Returns:
	        A list of dicts with the keys ``"title"``, ``"body"`` and ``"href"``.
	        Result cards whose first link has no ``href`` or a site-relative one
	        (starting with ``/``) are skipped. If a Playwright timeout occurs while
	        reading the cards, the results collected so far are returned.

	    Raises:
	        BraveSearchBlockedException: if no result card is found and the page
	            content contains the word "suspicious".
	    """
	    async with playwright_open_page(browser) as page:

	        async def _block_resources(route, request):
	            # Stylesheets and images are not needed to read the result list.
	            if request.resource_type in ("stylesheet", "image"):
	                await route.abort()
	            else:
	                await route.continue_()

	        # "**/*" matches every request URL. A single "*" does not cross "/",
	        # so the previous "*/" pattern never matched an absolute URL and the
	        # handler above was never invoked.
	        await page.route("**/*", _block_resources)

	        url = f"https://search.brave.com/search?q={quote_plus(q)}"
	        await page.goto(url)

	        results_cards = await page.locator('.snippet').all()

	        if not results_cards:
	            logging.warning(f"No results for query: {q}")
	            page_content = await page.content()

	            if "suspicious" in page_content:
	                logging.warning("Brave search flagged browser as suspicious.")
	                raise BraveSearchBlockedException()

	        results = []

	        try:
	            for card in results_cards:
	                # Check the limit before doing any work so that a
	                # non-positive n_results never collects a result.
	                if len(results) >= n_results:
	                    break

	                title = await card.locator('.title').all_inner_texts()
	                description = await card.locator('.snippet-description').all_inner_texts()
	                href = await card.locator('a').nth(0).get_attribute('href')

	                # Filter out results with no URL or brave-specific URLs
	                if href is None or href.startswith('/'):
	                    continue

	                results.append({
	                    "title": title[0] if title else "",
	                    "body": description[0] if description else "",
	                    "href": href,
	                })

	        except TimeoutError as e:
	            # Best effort: keep whatever was parsed before the timeout.
	            logging.warning(
	                f"Timeout on selector while parsing Brave Search SERP: {e}")

	        return results


	async def query_ddg_search(q: str, n_results: int = 10):
	    """Search DuckDuckGo through the ``duckduckgo_search`` client.

	    Args:
	        q: Search query.
	        n_results: Passed to the client as ``max_results``.

	    Returns:
	        A list of dicts with the keys ``"title"``, ``"body"`` and ``"href"``,
	        copied from the entries the client returns.
	    """
	    # Function-scope import: asyncio is not among the imports visible at the
	    # top of this module.
	    import asyncio

	    def _search():
	        # Materialise the whole result list inside the worker thread so that no
	        # lazy iteration (and therefore no network I/O) happens on the event loop.
	        return [
	            {"title": hit["title"], "body": hit["body"], "href": hit["href"]}
	            for hit in DDGS().text(q, max_results=n_results)
	        ]

	    # DDGS.text is a synchronous call; running it directly inside this
	    # coroutine would block the event loop for the duration of the request.
	    return await asyncio.to_thread(_search)