Commit · d907837
Parent(s): none
Initial commit

Files changed:
- .gitignore +3 -0
- Dockerfile +18 -0
- README.md +10 -0
- app.py +258 -0
- scrap/base.py +17 -0
- scrap/gpatents.py +83 -0
- serp/arxiv.py +45 -0
- serp/base.py +109 -0
- serp/bing.py +65 -0
- serp/duckduckgo.py +29 -0
- serp/gpatents.py +78 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
__pycache__
.vscode
.venv
Dockerfile
ADDED
@@ -0,0 +1,18 @@
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

ENV PLAYWRIGHT_BROWSERS_PATH=0

RUN pip install --no-cache-dir playwright && \
    playwright install-deps chromium && \
    playwright install chromium

COPY . .

EXPOSE 7860

CMD ["python", "./app.py"]
README.md
ADDED
@@ -0,0 +1,10 @@
---
title: SERPent2
emoji: 🐍
colorFrom: green
colorTo: yellow
sdk: docker
app_port: 7860
short_description: A SERP scraping API for AI projects
pinned: true
---
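For reference, calling the deployed API from Python might look like the sketch below (illustrative only; it assumes the container is running locally and exposing port 7860 as configured in the Dockerfile, and the payload follows the SerpRequest/SerpQuery models defined in app.py further down):

import httpx

# Single-query search against one named backend ("duckduckgo" is registered in app.py).
payload = {
    "backend": "duckduckgo",
    "query": {"query": "solid state batteries", "n_results": 10, "sort_by": "relevance"},
}
resp = httpx.post("http://localhost:7860/serp/search", json=payload, timeout=60)
resp.raise_for_status()
for item in resp.json().get("results") or []:
    print(item["title"], "->", item["href"])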
app.py
ADDED
@@ -0,0 +1,258 @@
import asyncio
from contextlib import asynccontextmanager
import logging
from typing import Literal, Optional
from fastapi import FastAPI, APIRouter
from httpx import AsyncClient
from pydantic import BaseModel, Field
import uvicorn
from playwright.async_api import async_playwright, Browser

from scrap.base import ScrapperBackendBase
from scrap.gpatents import GpatentsScrapBackend
from serp.base import SERPBackendBase, SerpQuery, SerpResultItem, get_backends_doc, query_serp_backend
from serp.arxiv import ArxivSerpBackend
from serp.bing import BingSerpBackend
from serp.duckduckgo import DuckDuckGoSerpBackend
from serp.gpatents import GPatentsSerpBackend


logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# playwright global context
playwright = None
pw_browser: Optional[Browser] = None

# HttpX client
http_client = AsyncClient()


@asynccontextmanager
async def api_lifespan(app: FastAPI):
    """Lifespan context manager for FastAPI to manage the Playwright browser instance."""
    global playwright, pw_browser
    playwright = await async_playwright().start()
    # Headless mode is required inside the slim container, which has no display server.
    pw_browser = await playwright.chromium.launch(headless=True)

    yield

    await pw_browser.close()
    await playwright.stop()

app = FastAPI(lifespan=api_lifespan)
serp_router = APIRouter(prefix="/serp", tags=["SERP"])
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])

# All backends for SERP scraping
SERP_BACKENDS = {b.name: b for b in [DuckDuckGoSerpBackend(),
                                     BingSerpBackend(),
                                     ArxivSerpBackend(),
                                     GPatentsSerpBackend()]}

# All backends for full-content scraping
SCRAP_BACKENDS = {b.content_type: b for b in [
    GpatentsScrapBackend()
]}

app.description = get_backends_doc(SERP_BACKENDS.values())


# ======================== Request schemas for SERP search ================================

class SerpRequest(BaseModel):
    """Model for a single-query SERP search"""
    query: SerpQuery = Field(..., description="The query to perform")
    backend: str = Field(..., description="The backend to use for search.")


class SerpResponse(BaseModel):
    """Model for single-query SERP search results"""
    error: Optional[str]
    backend: Optional[str]
    results: Optional[list[SerpResultItem]] = Field(
        None, description="List of search results for the query")


class SerpBulkRequest(BaseModel):
    """A bulk request with many queries"""
    queries: list[SerpQuery] = Field(...,
                                     description="List of queries to perform")
    backend: str = Field(...,
                         description="The backend to use for the bulk search.")


class SerpBulkResultItem(BaseModel):
    """Intermediate result item for bulk results"""
    query: str
    error: Optional[str]
    backend: Optional[str]
    results: Optional[list[SerpResultItem]]

    @classmethod
    def from_err(cls, q: SerpQuery, err: Exception, backend: str):
        return cls(query=q.query, error=str(err), results=None, backend=backend)

    @classmethod
    def from_results(cls, q: SerpQuery, results: list[SerpResultItem], backend: str):
        return cls(query=q.query, error=None, results=results, backend=backend)


class SerpBulkResponse(BaseModel):
    """Response to a bulk query"""
    queries: list[SerpBulkResultItem]


class SerpAutoSearchRequest(BaseModel):
    query: SerpQuery
    category: Literal["general", "patent", "scholar"] = "general"


class SerpAutoBulkSearchRequest(BaseModel):
    """An auto-bulk request with many queries"""
    queries: list[SerpQuery]
    category: Literal["general", "patent", "scholar"] = "general"

# =============================================== SERP routes ============================================


@serp_router.post("/search")
async def search(req: SerpRequest) -> SerpResponse:
    """Performs a single SERP search against the given backend with the given query."""

    # Find the backend with the given name
    backend: SERPBackendBase = SERP_BACKENDS.get(req.backend)

    if backend:
        try:
            results = await query_serp_backend(backend, req.query, http_client, pw_browser)
            return SerpResponse(error=None, results=results, backend=backend.name)
        except Exception as e:
            logging.warning(f"Error while querying {backend.name}: {e}")
            return SerpResponse(error=str(e), results=[], backend=backend.name)

    return SerpResponse(error="No backend with the given backend name was found.", backend=req.backend, results=None)


@serp_router.post("/search/bulk")
async def search_bulk(req: SerpBulkRequest) -> SerpBulkResponse:
    """Performs a bulk SERP search against the given backend with the given queries."""

    # Find the backend with the given name
    backend: SERPBackendBase = SERP_BACKENDS.get(req.backend)

    if backend:
        logging.info(
            f"Bulk querying {backend.name} with queries: {req.queries}")

        results = await asyncio.gather(*[query_serp_backend(backend, q, http_client, pw_browser) for q in req.queries], return_exceptions=True)

        for r in results:
            if isinstance(r, Exception):
                logging.warning(
                    f"Exception occurred while querying {backend.name}: {r}")

        result_sections = [SerpBulkResultItem.from_err(q, r, backend.name)
                           if isinstance(r, Exception) else SerpBulkResultItem.from_results(q, r, backend.name)
                           for r, q in zip(results, req.queries)]

        return SerpBulkResponse(queries=result_sections)

    # No matching backend: report the error per query so the response still matches SerpBulkResponse.
    return SerpBulkResponse(queries=[
        SerpBulkResultItem(query=q.query, error="No backend with the given backend name was found.",
                           backend=req.backend, results=None)
        for q in req.queries])


@serp_router.post("/fallback_search")
async def search_fallback(auto: SerpAutoSearchRequest) -> SerpResponse:
    """Searches the given query with the first usable backend that matches the requested content category and falls back to the next one if it fails."""

    logging.info(f"Auto-search with query {auto}")
    for backend in SERP_BACKENDS.values():
        if backend.category.lower() != auto.category.lower():
            continue

        try:
            results = await query_serp_backend(backend, auto.query, http_client, pw_browser)
            return SerpResponse(error=None, results=results, backend=backend.name)
        except Exception as e:
            logging.warning(f"Search with {backend.name} backend failed: {e}")
            logging.info("Trying out query with next available backend.")
            continue

    return SerpResponse(error="No adequate backend found or all backends are rate-limited", results=None, backend=None)


@serp_router.post("/fallback_search/bulk")
async def search_fallback_bulk(auto: SerpAutoBulkSearchRequest) -> SerpBulkResponse:
    """Bulk searches the given queries with the first usable backend that matches the requested content category and falls back to the next one if it fails."""

    logging.info(f"Bulk Auto-search with query {auto}")

    async def _process_q(q: SerpQuery):
        for backend in SERP_BACKENDS.values():
            if backend.category.lower() != auto.category.lower():
                continue

            try:
                results = await query_serp_backend(backend, q, http_client, pw_browser)
                return (results, backend)
            except Exception as e:
                logging.warning(
                    f"Search with {backend.name} backend failed: {e}")
                continue

        raise Exception(
            "No adequate backend found or all backends are rate-limited")

    tasks = await asyncio.gather(*[_process_q(q) for q in auto.queries], return_exceptions=True)
    result_sections = [SerpBulkResultItem.from_err(q, r, "")
                       if isinstance(r, Exception) else SerpBulkResultItem.from_results(q, r[0], r[1].name)
                       for r, q in zip(tasks, auto.queries)]

    return SerpBulkResponse(queries=result_sections)

# =============================================== scrapping routes ============================================


class FetchContentResponse(BaseModel):
    error: Optional[str]
    backend: Optional[str]
    content: Optional[dict]


@scrap_router.get("/full_contents")
async def full_content(slug: str) -> FetchContentResponse:
    splitted_slug = slug.split(":", 1)
    content_type = splitted_slug[0]
    content_id = splitted_slug[1]

    backend: ScrapperBackendBase = SCRAP_BACKENDS.get(content_type)

    if backend:
        try:
            result = await backend.scrap(http_client, content_id)
            return FetchContentResponse(error=None, backend=backend.content_type, content=result.model_dump())
        except Exception as e:
            return FetchContentResponse(error=str(e), backend=backend.content_type, content=None)

    return FetchContentResponse(error="No supporting backend found", backend="", content=None)


@scrap_router.get("/fetch_content")
async def fetch_content(back: str, id: str) -> FetchContentResponse:
    backend: ScrapperBackendBase = SCRAP_BACKENDS.get(back)

    if backend:
        try:
            result = await backend.scrap(http_client, id)
            return FetchContentResponse(error=None, backend=back, content=result.model_dump())
        except Exception as e:
            return FetchContentResponse(error=str(e), backend=back, content=None)

    return FetchContentResponse(error="No backend found", backend=back, content=None)


app.include_router(serp_router)
app.include_router(scrap_router)

uvicorn.run(app, host="0.0.0.0", port=7860)
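A usage sketch for the fallback endpoints defined above (illustrative only; it assumes the service is reachable on localhost:7860 and the payload mirrors the SerpAutoBulkSearchRequest schema):

import httpx

# Each query is tried against every backend of the requested category,
# in registration order, until one of them succeeds.
payload = {
    "category": "scholar",
    "queries": [
        {"query": "retrieval augmented generation survey", "n_results": 10},
        {"query": "state space models for sequence modeling", "n_results": 10},
    ],
}
resp = httpx.post("http://localhost:7860/serp/fallback_search/bulk", json=payload, timeout=120)
for entry in resp.json()["queries"]:
    print(entry["query"], entry["backend"], entry["error"])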
scrap/base.py
ADDED
@@ -0,0 +1,17 @@
from abc import abstractmethod
from httpx import AsyncClient
from pydantic import BaseModel


class ScrapperBackendBase(BaseModel):
    """Base class for a source scrapper"""

    @property
    @abstractmethod
    def content_type(self) -> str:
        """Type of content that this backend can scrap. Used to determine which scrap backend to use to fetch full contents."""
        pass

    @abstractmethod
    async def scrap(self, client: AsyncClient, id: str) -> BaseModel:
        pass
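To make the contract concrete, a minimal hypothetical subclass could look like the sketch below (not part of this commit; the backend name, result schema, and URL are invented for illustration). An instance would be registered in SCRAP_BACKENDS in app.py, keyed by its content_type.

from httpx import AsyncClient
from pydantic import BaseModel

from scrap.base import ScrapperBackendBase


class ExampleNoteResult(BaseModel):
    # Hypothetical result schema, for illustration only.
    id: str
    text: str


class ExampleNoteScrapBackend(ScrapperBackendBase):
    """Hypothetical backend that fetches a plain-text note by id from a made-up service."""

    @property
    def content_type(self) -> str:
        return "example-note"

    async def scrap(self, client: AsyncClient, id: str) -> ExampleNoteResult:
        # Placeholder URL, for illustration only.
        response = await client.get(f"https://example.com/notes/{id}.txt")
        response.raise_for_status()
        return ExampleNoteResult(id=id, text=response.text)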
scrap/gpatents.py
ADDED
@@ -0,0 +1,83 @@
import re
from typing import Optional
from bs4 import BeautifulSoup
from pydantic import BaseModel

from scrap.base import ScrapperBackendBase


class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent.
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description of the patent containing the field of the invention, background, summary, etc.
    description: Optional[str] = None
    # The full claims of the patent.
    claims: Optional[str] = None
    # The field of the invention, if available.
    field_of_invention: Optional[str] = None
    # The background of the invention, if available.
    background: Optional[str] = None


class GpatentsScrapBackend(ScrapperBackendBase):
    @property
    def content_type(self):
        return "patent"

    async def scrap(self, client, id):
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
        }
        patent_url = f"https://patents.google.com/patent/{id}/en"
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(
            strip=True) if abstract_div else None

        # Description
        description_section = soup.find("section", itemprop="description")
        description = description_section.get_text(
            separator="\n", strip=True) if description_section else None

        # Field of the Invention
        invention_field_match = re.findall(
            r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
        invention_field = invention_field_match[0][1].strip(
        ) if invention_field_match else None

        # Background of the Invention
        invention_background_match = re.findall(
            r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
        invention_background = invention_background_match[0][1].strip(
        ) if invention_background_match else None

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = claims_section.get_text(
            separator="\n", strip=True) if claims_section else None

        # Patent Title
        meta_title = soup.find("meta", {"name": "DC.title"}).get(
            "content").strip()

        return PatentScrapResult(
            abstract=abstract,
            description=description,
            claims=claims,
            title=meta_title,
            field_of_invention=invention_field,
            background=invention_background
        )
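A standalone usage sketch for this backend (the publication id below is a placeholder, not a reference to a specific patent):

import asyncio

from httpx import AsyncClient

from scrap.gpatents import GpatentsScrapBackend


async def main():
    async with AsyncClient() as client:
        backend = GpatentsScrapBackend()
        # Placeholder publication id, for illustration only.
        result = await backend.scrap(client, "US1234567A")
        print(result.title)
        print((result.abstract or "")[:200])

asyncio.run(main())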
serp/arxiv.py
ADDED
@@ -0,0 +1,45 @@
from serp.base import SERPBackendBase, SerpResultItem
from lxml import etree


class ArxivSerpBackend(SERPBackendBase):
    @property
    def name(self):
        return "arxiv"

    async def query(self, query, client):
        """Searches arXiv for the specified query and returns a list of results with titles and PDF URLs."""
        ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
        ARXIV_API_URL = 'https://export.arxiv.org/api/query?'

        search_params = {
            'search_query': query.query,
            'start': 0,
            'max_results': query.n_results,
            'sortBy': "submittedDate" if query.sort_by == "date" else "relevance"
        }
        query_url = ARXIV_API_URL

        response = await client.get(query_url, params=search_params)
        response.raise_for_status()

        root = etree.fromstring(response.content)
        entries = root.findall('atom:entry', ATOM_NAMESPACE)

        results = []
        for entry in entries:
            title = entry.find(
                'atom:title', ATOM_NAMESPACE).text.strip().replace('\n', ' ')
            id = entry.find('atom:id', ATOM_NAMESPACE).text.strip()
            pdf_url = entry.find(
                'atom:id', ATOM_NAMESPACE).text.replace('/abs/', '/pdf/')
            summary = entry.find(
                'atom:summary', ATOM_NAMESPACE).text.strip()
            results.append(SerpResultItem(
                title=title, href=pdf_url, body=summary, id=id))

        return results

    @property
    def category(self):
        return "scholar"
serp/base.py
ADDED
@@ -0,0 +1,109 @@
from abc import ABC, ABCMeta, abstractmethod
from contextlib import asynccontextmanager
import logging
from typing import Literal, Optional
from httpx import AsyncClient
from pydantic import BaseModel, Field
from playwright.async_api import Browser, BrowserContext, Page
from asyncio import Semaphore

# ========================== Schemas ==========================


class SerpQuery(BaseModel):
    """Model for SERP query"""
    query: str = Field(
        ..., description="The query to search for")
    n_results: int = Field(
        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
    sort_by: Literal["relevance",
                     "date"] = Field(default="relevance", description="How to sort search results.")


class SerpResultItem(BaseModel):
    """Model for a single SERP result item"""
    title: str = Field(..., description="Title of the search result")
    href: str = Field(..., description="URL of the search result")
    body: Optional[str] = Field(
        None, description="Snippet of the search result")
    content_slug: Optional[str] = Field(
        None, description="Content slug of the search result. A slug that encodes the content type and URL that can be used to fetch the full content later")

    class Config:
        extra = "allow"  # Allow additional fields in the result item


# =============================== Base classes ===============================


class SERPBackendBase(ABC):
    """Base class for SERP scrapping backends"""

    def __init__(self):
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs"""
        pass

    @property
    @abstractmethod
    def category(self) -> Literal["general", "patent", "scholar"]:
        """Content category that the backend provides. Used for search_auto"""
        pass

    @abstractmethod
    async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
        """Perform a SERP query and return results"""
        pass


class PlaywrightSerpBackendBase(SERPBackendBase):
    """Base class for SERP scrapping backends using Playwright"""

    def __init__(self):
        pass

    async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
        """Perform a SERP query and return results using Playwright"""
        raise NotImplementedError("query_serp_page method must be used instead")

    @abstractmethod
    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Perform a SERP query using Playwright and return results"""
        pass


async def query_serp_backend(backend: SERPBackendBase, query: SerpQuery, client: AsyncClient, browser: Browser) -> list[SerpResultItem]:
    """Queries the given backend with the given SERP query."""
    logging.info(f"Querying {backend.name} with {query}")
    if isinstance(backend, PlaywrightSerpBackendBase):
        return await backend.query_serp_page(browser, query)
    else:
        return await backend.query(query, client)


def get_backends_doc(backends: list[SERPBackendBase]) -> str:
    """Retrieves all the available backends and builds a list for doc"""
    doc_str = "### Available SERP Backends \n\n\n "
    for backend in backends:
        doc_str += f" \n\n `{backend.name}` - category: `{backend.category}`"

    return doc_str


@asynccontextmanager
async def playwright_open_page(browser: Browser, sema: Semaphore):
    """Context manager for playwright pages"""
    # Acquire the concurrency semaphore
    await sema.acquire()
    context: BrowserContext = await browser.new_context()
    page: Page = await context.new_page()
    try:
        yield page
    finally:
        await page.close()
        await context.close()
        sema.release()
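As an illustration of the interface, a minimal hypothetical HTTP-based backend (invented endpoint and field names, not part of this commit) only needs name, category, and query; browser-driven backends subclass PlaywrightSerpBackendBase instead, and query_serp_backend dispatches between the two paths by type:

from httpx import AsyncClient

from serp.base import SERPBackendBase, SerpQuery, SerpResultItem


class ExampleJsonSerpBackend(SERPBackendBase):
    """Hypothetical backend querying a made-up JSON search API."""

    @property
    def name(self) -> str:
        return "example-json"

    @property
    def category(self):
        return "general"

    async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
        # Placeholder endpoint and response shape, for illustration only.
        response = await client.get("https://example.com/search",
                                    params={"q": query.query, "n": query.n_results})
        response.raise_for_status()
        return [SerpResultItem(title=hit["title"], href=hit["url"], body=hit.get("snippet"))
                for hit in response.json()["hits"]]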
serp/bing.py
ADDED
@@ -0,0 +1,65 @@
from asyncio import Semaphore
from urllib.parse import quote_plus
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page


class BingSerpBackend(PlaywrightSerpBackendBase):

    def __init__(self):
        super().__init__()
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(4)

    @property
    def name(self) -> str:
        return "bing"

    async def query_serp_page(self, browser, query: SerpQuery):
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:
            async def _block_resources(route, request):
                if request.resource_type in ["stylesheet", "image"]:
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://www.bing.com/search?q={quote_plus(query.query)}"
            await page.goto(url)

            await page.wait_for_selector("li.b_algo")

            results = []

            items = await page.query_selector_all("li.b_algo")
            for item in items[:query.n_results]:
                title_el = await item.query_selector("h2 > a")
                url = await title_el.get_attribute("href") if title_el else None
                title = await title_el.inner_text() if title_el else ""

                snippet = ""

                # Try several fallback selectors
                for selector in [
                    "div.b_caption p",  # typical snippet
                    "div.b_caption",    # sometimes snippet is here
                    "div.b_snippet",    # used in some result types
                    "div.b_text",       # used in some panels
                    "p"                 # fallback to any paragraph
                ]:
                    snippet_el = await item.query_selector(selector)
                    if snippet_el:
                        snippet = await snippet_el.inner_text()
                        if snippet.strip():
                            break

                if title and url:
                    results.append(SerpResultItem(
                        title=title.strip(), href=url.strip(), body=snippet.strip()))

            return results

    @property
    def category(self):
        return "general"
serp/duckduckgo.py
ADDED
@@ -0,0 +1,29 @@
from duckduckgo_search import DDGS
from serp.base import SERPBackendBase, SerpResultItem


class DuckDuckGoSerpBackend(SERPBackendBase):

    def __init__(self):
        self.ddg = DDGS()
        super().__init__()

    @property
    def name(self):
        return "duckduckgo"

    async def query(self, query, client) -> list[SerpResultItem]:
        results = []

        for result in self.ddg.text(query.query, max_results=query.n_results):
            results.append(SerpResultItem(
                title=result["title"],
                body=result["body"],
                href=result["href"],
                content_slug=None))

        return results

    @property
    def category(self):
        return "general"
serp/gpatents.py
ADDED
@@ -0,0 +1,78 @@
from asyncio import Semaphore
import re
from urllib.parse import quote_plus
from playwright.async_api import Browser

from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page


class GPatentsSerpBackend(PlaywrightSerpBackendBase):
    """GPatents SERP backend for scraping patent data."""

    def __init__(self):
        super().__init__()
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(2)

    @property
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs."""
        return "gpatents"

    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Query the GPatents SERP page and return a list of SerpResultItem."""

        # regex to locate a patent id
        PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"

        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:

            async def _block_resources(route, request):
                if request.resource_type in ["stylesheet", "image"]:
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://patents.google.com/?q={quote_plus(query.query)}&num={query.n_results}"
            await page.goto(url)

            # Wait for at least one search result item to appear
            # This ensures the page has loaded enough to start scraping
            await page.wait_for_function(
                """() => document.querySelectorAll('search-result-item').length >= 1""",
                timeout=30_000
            )

            items = await page.locator("search-result-item").all()
            results = []
            for item in items:
                text = " ".join(await item.locator("span").all_inner_texts())
                match = re.search(PATENT_ID_REGEX, text)
                if not match:
                    continue

                patent_id = match.group()

                try:
                    title = await item.locator("h3, h4").first.inner_text(timeout=1000)
                    body = await item.locator("div.abstract, div.result-snippet, .snippet, .result-text").first.inner_text(timeout=1000)
                except Exception:
                    continue  # If we can't get title or body, skip this item

                results.append(SerpResultItem(
                    href=f"https://patents.google.com/patent/{patent_id}/en",
                    title=title,
                    body=body,
                    patent_id=patent_id,
                    # Slug uses the scrap backend's content_type ("patent") so that
                    # /scrap/full_contents can resolve it against SCRAP_BACKENDS.
                    content_slug=f"patent:{patent_id}"
                ))

            return results

    @property
    def category(self):
        return "patent"