Game4all committed
Commit d907837 · 0 Parent(s)

Initial commit

Files changed (11)
  1. .gitignore +3 -0
  2. Dockerfile +18 -0
  3. README.md +10 -0
  4. app.py +258 -0
  5. scrap/base.py +17 -0
  6. scrap/gpatents.py +83 -0
  7. serp/arxiv.py +45 -0
  8. serp/base.py +109 -0
  9. serp/bing.py +65 -0
  10. serp/duckduckgo.py +29 -0
  11. serp/gpatents.py +78 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__
+ .vscode
+ .venv
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ ENV PLAYWRIGHT_BROWSERS_PATH=0
+
+ RUN pip install --no-cache-dir playwright && \
+     playwright install-deps chromium && \
+     playwright install chromium
+
+ COPY . .
+
+ EXPOSE 7860
+
+ CMD ["python", "./app.py"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: SERPent2
+ emoji: 🐍
+ colorFrom: green
+ colorTo: yellow
+ sdk: docker
+ app_port: 7860
+ short_description: A SERP scraping API for AI projects
+ pinned: true
+ ---
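The Space exposes the API on port 7860. As a minimal client sketch (not part of this commit), here is how the single-query route defined in app.py below could be called, assuming the container is reachable on localhost and using the `duckduckgo` backend; the request shape mirrors `SerpRequest`/`SerpQuery` and the response mirrors `SerpResponse`:

```python
# Minimal client sketch (assumes the container is reachable on localhost:7860).
import httpx

payload = {
    "backend": "duckduckgo",
    "query": {"query": "retrieval augmented generation", "n_results": 10, "sort_by": "relevance"},
}

resp = httpx.post("http://localhost:7860/serp/search", json=payload, timeout=60)
resp.raise_for_status()
data = resp.json()

if data["error"] is None:
    for item in data["results"]:
        print(item["title"], item["href"])
else:
    print("search failed:", data["error"])
```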
app.py ADDED
@@ -0,0 +1,258 @@
+ import asyncio
+ from contextlib import asynccontextmanager
+ import logging
+ from typing import Literal, Optional
+ from fastapi import FastAPI, APIRouter
+ from httpx import AsyncClient
+ from pydantic import BaseModel, Field
+ import uvicorn
+ from playwright.async_api import async_playwright, Browser
+
+ from scrap.base import ScrapperBackendBase
+ from scrap.gpatents import GpatentsScrapBackend
+ from serp.base import SERPBackendBase, SerpQuery, SerpResultItem, get_backends_doc, query_serp_backend
+ from serp.arxiv import ArxivSerpBackend
+ from serp.bing import BingSerpBackend
+ from serp.duckduckgo import DuckDuckGoSerpBackend
+ from serp.gpatents import GPatentsSerpBackend
+
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format='[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d]: %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S'
+ )
+
+ # Playwright global context
+ playwright = None
+ pw_browser: Optional[Browser] = None
+
+ # Shared httpx client
+ http_client = AsyncClient()
+
+
+ @asynccontextmanager
+ async def api_lifespan(app: FastAPI):
+     """Lifespan context manager for FastAPI to manage the Playwright browser instance."""
+     global playwright, pw_browser
+     playwright = await async_playwright().start()
+     # Run headless: the container has no display server.
+     pw_browser = await playwright.chromium.launch(headless=True)
+
+     yield
+
+     await pw_browser.close()
+     await playwright.stop()
+
+
+ app = FastAPI(lifespan=api_lifespan)
+ serp_router = APIRouter(prefix="/serp", tags=["SERP"])
+ scrap_router = APIRouter(prefix="/scrap", tags=["scraping"])
+
+ # All backends for SERP scraping
+ SERP_BACKENDS = {b.name: b for b in [DuckDuckGoSerpBackend(),
+                                      BingSerpBackend(),
+                                      ArxivSerpBackend(),
+                                      GPatentsSerpBackend()]}
+
+ # All backends for full-content scraping
+ SCRAP_BACKENDS = {b.content_type: b for b in [
+     GpatentsScrapBackend()
+ ]}
+
+ app.description = get_backends_doc(SERP_BACKENDS.values())
+
+
+ # ======================== Request schemas for SERP search ================================
+
+ class SerpRequest(BaseModel):
+     """Model for a single-query SERP search"""
+     query: SerpQuery = Field(..., description="The query to perform")
+     backend: str = Field(..., description="The backend to use for the search.")
+
+
+ class SerpResponse(BaseModel):
+     """Model for single-query SERP search results"""
+     error: Optional[str]
+     backend: Optional[str]
+     results: Optional[list[SerpResultItem]] = Field(
+         None, description="List of search results for the query")
+
+
+ class SerpBulkRequest(BaseModel):
+     """A bulk request with many queries"""
+     queries: list[SerpQuery] = Field(...,
+                                      description="List of queries to perform")
+     backend: str = Field(...,
+                          description="The backend to use for the bulk search.")
+
+
+ class SerpBulkResultItem(BaseModel):
+     """Intermediate result item for bulk results"""
+     query: str
+     error: Optional[str]
+     backend: Optional[str]
+     results: Optional[list[SerpResultItem]]
+
+     @classmethod
+     def from_err(cls, q: SerpQuery, err: Exception, backend: str):
+         return cls(query=q.query, error=str(err), results=None, backend=backend)
+
+     @classmethod
+     def from_results(cls, q: SerpQuery, results: list[SerpResultItem], backend: str):
+         return cls(query=q.query, error=None, results=results, backend=backend)
+
+
+ class SerpBulkResponse(BaseModel):
+     """Response to a bulk query"""
+     queries: list[SerpBulkResultItem]
+
+
+ class SerpAutoSearchRequest(BaseModel):
+     query: SerpQuery
+     category: Literal["general", "patent", "scholar"] = "general"
+
+
+ class SerpAutoBulkSearchRequest(BaseModel):
+     """An auto-bulk request with many queries"""
+     queries: list[SerpQuery]
+     category: Literal["general", "patent", "scholar"] = "general"
+
+
+ # =============================================== SERP routes ============================================
+
+
+ @serp_router.post("/search")
+ async def search(req: SerpRequest) -> SerpResponse:
+     """Performs a single SERP search against the given backend with the given query."""
+
+     # Find the backend with the given name
+     backend: Optional[SERPBackendBase] = SERP_BACKENDS.get(req.backend)
+
+     if backend:
+         try:
+             results = await query_serp_backend(backend, req.query, http_client, pw_browser)
+             return SerpResponse(error=None, results=results, backend=backend.name)
+         except Exception as e:
+             logging.warning(f"Error while querying {backend.name}: {e}")
+             return SerpResponse(error=str(e), results=[], backend=backend.name)
+
+     return SerpResponse(error="No backend with the given backend name was found.", backend=req.backend, results=None)
+
+
+ @serp_router.post("/search/bulk")
+ async def search_bulk(req: SerpBulkRequest) -> SerpBulkResponse:
+     """Performs a bulk SERP search against the given backend with the given queries."""
+
+     # Find the backend with the given name
+     backend: Optional[SERPBackendBase] = SERP_BACKENDS.get(req.backend)
+
+     if backend:
+         logging.info(
+             f"Bulk querying {backend.name} with queries: {req.queries}")
+
+         results = await asyncio.gather(*[query_serp_backend(backend, q, http_client, pw_browser) for q in req.queries], return_exceptions=True)
+
+         for r in results:
+             if isinstance(r, Exception):
+                 logging.warning(
+                     f"Exception occurred while querying {backend.name}: {r}")
+
+         result_sections = [SerpBulkResultItem.from_err(q, r, backend.name)
+                            if isinstance(r, Exception) else SerpBulkResultItem.from_results(q, r, backend.name) for r, q in zip(results, req.queries)]
+
+         return SerpBulkResponse(queries=result_sections)
+
+     # No matching backend: report the error for every query so the response type stays consistent
+     return SerpBulkResponse(queries=[SerpBulkResultItem(query=q.query, error="No backend with the given backend name was found.", backend=req.backend, results=None) for q in req.queries])
+
+
+ @serp_router.post("/fallback_search")
+ async def search_fallback(auto: SerpAutoSearchRequest) -> SerpResponse:
+     """Searches the given query with the first usable backend that matches the requested content category and falls back to the next one if it fails."""
+
+     logging.info(f"Auto-search with query {auto}")
+     for backend in SERP_BACKENDS.values():
+         if backend.category.lower() != auto.category.lower():
+             continue
+
+         try:
+             results = await query_serp_backend(backend, auto.query, http_client, pw_browser)
+             return SerpResponse(error=None, results=results, backend=backend.name)
+         except Exception as e:
+             logging.warning(f"Search with {backend.name} backend failed: {e}")
+             logging.info("Trying out query with next available backend.")
+             continue
+
+     return SerpResponse(error="No adequate backend found or all backends are rate-limited", results=None, backend=None)
+
+
+ @serp_router.post("/fallback_search/bulk")
+ async def search_fallback_bulk(auto: SerpAutoBulkSearchRequest) -> SerpBulkResponse:
+     """Bulk searches the given queries with the first usable backend that matches the requested content category and falls back to the next one if it fails."""
+
+     logging.info(f"Bulk auto-search with queries {auto}")
+
+     async def _process_q(q: SerpQuery):
+         for backend in SERP_BACKENDS.values():
+             if backend.category.lower() != auto.category.lower():
+                 continue
+
+             try:
+                 results = await query_serp_backend(backend, q, http_client, pw_browser)
+                 return (results, backend)
+             except Exception as e:
+                 logging.warning(
+                     f"Search with {backend.name} backend failed: {e}")
+                 continue
+
+         raise Exception(
+             "No adequate backend found or all backends are rate-limited")
+
+     tasks = await asyncio.gather(*[_process_q(q) for q in auto.queries], return_exceptions=True)
+     result_sections = [SerpBulkResultItem.from_err(q, r, "") if isinstance(
+         r, Exception) else SerpBulkResultItem.from_results(q, r[0], r[1].name) for r, q in zip(tasks, auto.queries)]
+
+     return SerpBulkResponse(queries=result_sections)
+
+
+ # =============================================== scraping routes ============================================
+
+
+ class FetchContentResponse(BaseModel):
+     error: Optional[str]
+     backend: Optional[str]
+     content: Optional[dict]
+
+
+ @scrap_router.get("/full_contents")
+ async def full_content(slug: str) -> FetchContentResponse:
+     """Fetches the full content for a content slug of the form `<content_type>:<content_id>`."""
+     content_type, _, content_id = slug.partition(":")
+     if not content_id:
+         return FetchContentResponse(error="Malformed slug, expected '<content_type>:<content_id>'", backend=None, content=None)
+
+     backend: Optional[ScrapperBackendBase] = SCRAP_BACKENDS.get(content_type)
+
+     if backend:
+         try:
+             result = await backend.scrap(http_client, content_id)
+             return FetchContentResponse(error=None, backend=backend.content_type, content=result.model_dump())
+         except Exception as e:
+             return FetchContentResponse(error=str(e), backend=backend.content_type, content=None)
+
+     return FetchContentResponse(error="No backend supporting this content type was found", backend="", content=None)
+
+
+ @scrap_router.get("/fetch_content")
+ async def fetch_content(back: str, id: str) -> FetchContentResponse:
+     """Fetches the full content for the given id using the scraping backend named `back`."""
+     backend: Optional[ScrapperBackendBase] = SCRAP_BACKENDS.get(back)
+
+     if backend:
+         try:
+             result = await backend.scrap(http_client, id)
+             return FetchContentResponse(error=None, backend=back, content=result.model_dump())
+         except Exception as e:
+             return FetchContentResponse(error=str(e), backend=back, content=None)
+
+     return FetchContentResponse(error="No backend found", backend=back, content=None)
+
+
+ app.include_router(serp_router)
+ app.include_router(scrap_router)
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
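For reference, a sketch (not part of the commit) of the intended round trip between the SERP routes and the scraping routes: a `gpatents` search result carries a `content_slug` of the form `patent:<patent_id>`, which `/scrap/full_contents` resolves against `SCRAP_BACKENDS` by content type. The host, port, and example query are assumptions.

```python
# Sketch of the SERP -> full-content round trip (assumes localhost:7860).
import httpx

with httpx.Client(base_url="http://localhost:7860", timeout=120) as client:
    serp = client.post("/serp/search", json={
        "backend": "gpatents",
        "query": {"query": "solid state battery electrolyte", "n_results": 10},
    }).json()

    for item in serp.get("results") or []:
        slug = item.get("content_slug")
        if not slug:
            continue
        # Resolve the slug into the full patent contents via the scraping route.
        full = client.get("/scrap/full_contents", params={"slug": slug}).json()
        if full["error"] is None:
            print(full["content"]["title"])
        break
```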
scrap/base.py ADDED
@@ -0,0 +1,17 @@
+ from abc import ABC, abstractmethod
+ from httpx import AsyncClient
+ from pydantic import BaseModel
+
+
+ class ScrapperBackendBase(ABC):
+     """Base class for a source scraper"""
+
+     @property
+     @abstractmethod
+     def content_type(self) -> str:
+         """Type of content that this backend can scrape. Used to determine which scraping backend to use to fetch full contents."""
+         pass
+
+     @abstractmethod
+     async def scrap(self, client: AsyncClient, id: str) -> BaseModel:
+         """Scrapes the full content for the given id and returns it as a pydantic model."""
+         pass
scrap/gpatents.py ADDED
@@ -0,0 +1,83 @@
+ import re
+ from typing import Optional
+ from bs4 import BeautifulSoup
+ from httpx import AsyncClient
+ from pydantic import BaseModel
+
+ from scrap.base import ScrapperBackendBase
+
+
+ class PatentScrapResult(BaseModel):
+     """Schema for the result of scraping a Google Patents page."""
+     # The title of the patent.
+     title: str
+     # The abstract of the patent, if available.
+     abstract: Optional[str] = None
+     # The full description of the patent containing the field of the invention, background, summary, etc.
+     description: Optional[str] = None
+     # The full claims of the patent.
+     claims: Optional[str] = None
+     # The field of the invention, if available.
+     field_of_invention: Optional[str] = None
+     # The background of the invention, if available.
+     background: Optional[str] = None
+
+
+ class GpatentsScrapBackend(ScrapperBackendBase):
+     @property
+     def content_type(self) -> str:
+         return "patent"
+
+     async def scrap(self, client: AsyncClient, id: str) -> PatentScrapResult:
+         headers = {
+             "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
+         }
+         patent_url = f"https://patents.google.com/patent/{id}/en"
+         response = await client.get(patent_url, headers=headers)
+         response.raise_for_status()
+
+         soup = BeautifulSoup(response.text, "html.parser")
+
+         # Abstract
+         abstract_div = soup.find("div", {"class": "abstract"})
+         abstract = abstract_div.get_text(
+             strip=True) if abstract_div else None
+
+         # Description
+         description_section = soup.find("section", itemprop="description")
+         description = description_section.get_text(
+             separator="\n", strip=True) if description_section else None
+
+         # Field of the invention
+         invention_field_match = re.findall(
+             r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description else None
+         invention_field = invention_field_match[0][1].strip(
+         ) if invention_field_match else None
+
+         # Background of the invention
+         invention_background_match = re.findall(
+             r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description else None
+         invention_background = invention_background_match[0][1].strip(
+         ) if invention_background_match else None
+
+         # Claims
+         claims_section = soup.find("section", itemprop="claims")
+         claims = claims_section.get_text(
+             separator="\n", strip=True) if claims_section else None
+
+         # Patent title
+         meta_title_tag = soup.find("meta", {"name": "DC.title"})
+         meta_title = meta_title_tag.get("content").strip() if meta_title_tag else ""
+
+         return PatentScrapResult(
+             abstract=abstract,
+             description=description,
+             claims=claims,
+             title=meta_title,
+             field_of_invention=invention_field,
+             background=invention_background
+         )
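As a quick way to exercise the patent scraper outside the API, a sketch assuming it is run from the repository root with network access; the patent number is an arbitrary placeholder:

```python
# Standalone sketch: call the Google Patents scraper directly.
import asyncio
from httpx import AsyncClient
from scrap.gpatents import GpatentsScrapBackend

async def main():
    async with AsyncClient() as client:
        # "US9876543B2" is only an example id in the expected format.
        result = await GpatentsScrapBackend().scrap(client, "US9876543B2")
        print(result.title)
        print((result.abstract or "")[:200])

asyncio.run(main())
```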
serp/arxiv.py ADDED
@@ -0,0 +1,45 @@
+ from httpx import AsyncClient
+ from lxml import etree
+
+ from serp.base import SERPBackendBase, SerpQuery, SerpResultItem
+
+
+ class ArxivSerpBackend(SERPBackendBase):
+     @property
+     def name(self):
+         return "arxiv"
+
+     async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
+         """Searches arXiv for the specified query and returns a list of results with titles and PDF URLs."""
+         ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
+         ARXIV_API_URL = 'https://export.arxiv.org/api/query'
+
+         search_params = {
+             'search_query': query.query,
+             'start': 0,
+             'max_results': query.n_results,
+             'sortBy': "submittedDate" if query.sort_by == "date" else "relevance"
+         }
+
+         response = await client.get(ARXIV_API_URL, params=search_params)
+         response.raise_for_status()
+
+         root = etree.fromstring(response.content)
+         entries = root.findall('atom:entry', ATOM_NAMESPACE)
+
+         results = []
+         for entry in entries:
+             title = entry.find(
+                 'atom:title', ATOM_NAMESPACE).text.strip().replace('\n', ' ')
+             id = entry.find('atom:id', ATOM_NAMESPACE).text.strip()
+             pdf_url = entry.find(
+                 'atom:id', ATOM_NAMESPACE).text.replace('/abs/', '/pdf/')
+             summary = entry.find(
+                 'atom:summary', ATOM_NAMESPACE).text.strip()
+             results.append(SerpResultItem(
+                 title=title, href=pdf_url, body=summary, id=id))
+
+         return results
+
+     @property
+     def category(self):
+         return "scholar"
serp/base.py ADDED
@@ -0,0 +1,109 @@
+ from abc import ABC, abstractmethod
+ from contextlib import asynccontextmanager
+ import logging
+ from typing import Literal, Optional
+ from httpx import AsyncClient
+ from pydantic import BaseModel, Field
+ from playwright.async_api import Browser, BrowserContext, Page
+ from asyncio import Semaphore
+
+ # ========================== Schemas ==========================
+
+
+ class SerpQuery(BaseModel):
+     """Model for a SERP query"""
+     query: str = Field(
+         ..., description="The query to search for")
+     n_results: int = Field(
+         10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
+     sort_by: Literal["relevance",
+                      "date"] = Field(default="relevance", description="How to sort search results.")
+
+
+ class SerpResultItem(BaseModel):
+     """Model for a single SERP result item"""
+     title: str = Field(..., description="Title of the search result")
+     href: str = Field(..., description="URL of the search result")
+     body: Optional[str] = Field(
+         None, description="Snippet of the search result")
+     content_slug: Optional[str] = Field(
+         None, description="Content slug of the search result. A slug that encodes the content type and id, which can be used to fetch the full content later")
+
+     class Config:
+         extra = "allow"  # Allow additional fields in the result item
+
+
+ # =============================== Base classes ===============================
+
+
+ class SERPBackendBase(ABC):
+     """Base class for SERP scraping backends"""
+
+     def __init__(self):
+         pass
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Name of the backend. Used for identification in slugs"""
+         pass
+
+     @property
+     @abstractmethod
+     def category(self) -> Literal["general", "patent", "scholar"]:
+         """Content category that the backend provides. Used to pick backends for the fallback search routes."""
+         pass
+
+     @abstractmethod
+     async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
+         """Perform a SERP query and return results"""
+         pass
+
+
+ class PlaywrightSerpBackendBase(SERPBackendBase):
+     """Base class for SERP scraping backends using Playwright"""
+
+     def __init__(self):
+         super().__init__()
+
+     async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
+         """HTTP-only querying is not supported for Playwright backends"""
+         raise NotImplementedError("query_serp_page must be used instead")
+
+     @abstractmethod
+     async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
+         """Perform a SERP query using Playwright and return results"""
+         pass
+
+
+ async def query_serp_backend(backend: SERPBackendBase, query: SerpQuery, client: AsyncClient, browser: Browser) -> list[SerpResultItem]:
+     """Queries the given backend with the given SERP query, dispatching to Playwright when the backend requires a browser."""
+     logging.info(f"Querying {backend.name} with {query}")
+     if isinstance(backend, PlaywrightSerpBackendBase):
+         return await backend.query_serp_page(browser, query)
+     else:
+         return await backend.query(query, client)
+
+
+ def get_backends_doc(backends: list[SERPBackendBase]) -> str:
+     """Builds a documentation list of all the available backends"""
+     doc_str = "### Available SERP Backends\n\n"
+     for backend in backends:
+         doc_str += f"\n\n`{backend.name}` - category: `{backend.category}`"
+
+     return doc_str
+
+
+ @asynccontextmanager
+ async def playwright_open_page(browser: Browser, sema: Semaphore):
+     """Context manager that opens a Playwright page in a fresh browser context, bounded by a concurrency semaphore"""
+     # Acquire the concurrency semaphore
+     await sema.acquire()
+     context: BrowserContext = await browser.new_context()
+     page: Page = await context.new_page()
+     try:
+         yield page
+     finally:
+         await page.close()
+         await context.close()
+         sema.release()
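To illustrate the interface above, a sketch of a hypothetical HTTP-based backend; the endpoint URL and JSON fields are placeholders, not a real provider:

```python
# Illustrative sketch of a custom HTTP-based backend built on SERPBackendBase.
# The endpoint URL and response fields below are hypothetical placeholders.
from httpx import AsyncClient

from serp.base import SERPBackendBase, SerpQuery, SerpResultItem


class ExampleJsonSerpBackend(SERPBackendBase):
    @property
    def name(self) -> str:
        return "example"

    @property
    def category(self):
        return "general"

    async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
        # Hypothetical JSON search endpoint; replace with a real provider.
        resp = await client.get("https://search.example.com/api",
                                params={"q": query.query, "count": query.n_results})
        resp.raise_for_status()
        return [SerpResultItem(title=hit["title"], href=hit["url"], body=hit.get("snippet"))
                for hit in resp.json().get("hits", [])]
```

Registering such a backend would only require adding an instance to the `SERP_BACKENDS` dict in app.py.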
serp/bing.py ADDED
@@ -0,0 +1,65 @@
+ from asyncio import Semaphore
+ from urllib.parse import quote_plus
+
+ from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page
+
+
+ class BingSerpBackend(PlaywrightSerpBackendBase):
+
+     def __init__(self):
+         super().__init__()
+         self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(4)
+
+     @property
+     def name(self) -> str:
+         return "bing"
+
+     async def query_serp_page(self, browser, query: SerpQuery) -> list[SerpResultItem]:
+         async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:
+             async def _block_resources(route, request):
+                 if request.resource_type in ["stylesheet", "image"]:
+                     await route.abort()
+                 else:
+                     await route.continue_()
+
+             await page.route("**/*", _block_resources)
+
+             url = f"https://www.bing.com/search?q={quote_plus(query.query)}"
+             await page.goto(url)
+
+             await page.wait_for_selector("li.b_algo")
+
+             results = []
+
+             items = await page.query_selector_all("li.b_algo")
+             for item in items[:query.n_results]:
+                 title_el = await item.query_selector("h2 > a")
+                 href = await title_el.get_attribute("href") if title_el else None
+                 title = await title_el.inner_text() if title_el else ""
+
+                 snippet = ""
+
+                 # Try several fallback selectors
+                 for selector in [
+                     "div.b_caption p",  # typical snippet
+                     "div.b_caption",    # sometimes the snippet is here
+                     "div.b_snippet",    # used in some result types
+                     "div.b_text",       # used in some panels
+                     "p"                 # fallback to any paragraph
+                 ]:
+                     snippet_el = await item.query_selector(selector)
+                     if snippet_el:
+                         snippet = await snippet_el.inner_text()
+                         if snippet.strip():
+                             break
+
+                 if title and href:
+                     results.append(SerpResultItem(
+                         title=title.strip(), href=href.strip(), body=snippet.strip()))
+
+             return results
+
+     @property
+     def category(self):
+         return "general"
serp/duckduckgo.py ADDED
@@ -0,0 +1,29 @@
+ import asyncio
+
+ from duckduckgo_search import DDGS
+ from serp.base import SERPBackendBase, SerpQuery, SerpResultItem
+
+
+ class DuckDuckGoSerpBackend(SERPBackendBase):
+
+     def __init__(self):
+         self.ddg = DDGS()
+         super().__init__()
+
+     @property
+     def name(self):
+         return "duckduckgo"
+
+     async def query(self, query: SerpQuery, client) -> list[SerpResultItem]:
+         results = []
+
+         # DDGS.text() is synchronous; run it in a worker thread so it doesn't block the event loop.
+         raw_results = await asyncio.to_thread(
+             self.ddg.text, query.query, max_results=query.n_results)
+
+         for result in raw_results:
+             results.append(SerpResultItem(
+                 title=result["title"],
+                 body=result["body"],
+                 href=result["href"],
+                 content_slug=None))
+
+         return results
+
+     @property
+     def category(self):
+         return "general"
serp/gpatents.py ADDED
@@ -0,0 +1,78 @@
+ from asyncio import Semaphore
+ import re
+ from urllib.parse import quote_plus
+ from playwright.async_api import Browser
+
+ from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page
+
+
+ class GPatentsSerpBackend(PlaywrightSerpBackendBase):
+     """GPatents SERP backend for scraping patent data."""
+
+     def __init__(self):
+         super().__init__()
+         self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(2)
+
+     @property
+     def name(self) -> str:
+         """Name of the backend. Used for identification in slugs."""
+         return "gpatents"
+
+     async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
+         """Query the GPatents SERP page and return a list of SerpResultItem."""
+
+         # Regex to locate a patent id
+         PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
+
+         async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:
+
+             async def _block_resources(route, request):
+                 if request.resource_type in ["stylesheet", "image"]:
+                     await route.abort()
+                 else:
+                     await route.continue_()
+
+             await page.route("**/*", _block_resources)
+
+             url = f"https://patents.google.com/?q={quote_plus(query.query)}&num={query.n_results}"
+             await page.goto(url)
+
+             # Wait for at least one search result item to appear.
+             # This ensures the page has loaded enough to start scraping.
+             await page.wait_for_function(
+                 """() => document.querySelectorAll('search-result-item').length >= 1""",
+                 timeout=30_000
+             )
+
+             items = await page.locator("search-result-item").all()
+             results = []
+             for item in items:
+                 text = " ".join(await item.locator("span").all_inner_texts())
+                 match = re.search(PATENT_ID_REGEX, text)
+                 if not match:
+                     continue
+
+                 patent_id = match.group()
+
+                 try:
+                     title = await item.locator("h3, h4").first.inner_text(timeout=1000)
+                     body = await item.locator("div.abstract, div.result-snippet, .snippet, .result-text").first.inner_text(timeout=1000)
+                 except Exception:
+                     continue  # If we can't get the title or body, skip this item
+
+                 results.append(SerpResultItem(
+                     href=f"https://patents.google.com/patent/{patent_id}/en",
+                     title=title,
+                     body=body,
+                     patent_id=patent_id,
+                     # The slug prefix must match the scrap backend's content_type ("patent")
+                     # so that /scrap/full_contents can resolve it.
+                     content_slug=f"patent:{patent_id}"
+                 ))
+
+             return results
+
+     @property
+     def category(self):
+         return "patent"