Commit · d907837
Parent(s): none
Initial commit

Files changed:
- .gitignore +3 -0
- Dockerfile +18 -0
- README.md +10 -0
- app.py +258 -0
- scrap/base.py +17 -0
- scrap/gpatents.py +83 -0
- serp/arxiv.py +45 -0
- serp/base.py +109 -0
- serp/bing.py +65 -0
- serp/duckduckgo.py +29 -0
- serp/gpatents.py +78 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
__pycache__
.vscode
.venv
Dockerfile
ADDED
@@ -0,0 +1,18 @@
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

ENV PLAYWRIGHT_BROWSERS_PATH=0

RUN pip install --no-cache-dir playwright && \
    playwright install-deps chromium && \
    playwright install chromium

COPY . .

EXPOSE 7860

CMD ["python", "./app.py"]
README.md
ADDED
@@ -0,0 +1,10 @@
---
title: SERPent2
emoji: 🐍
colorFrom: green
colorTo: yellow
sdk: docker
app_port: 7860
short_description: A SERP scraping API for AI projects
pinned: true
---
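For reference, calling the deployed API from Python might look like the sketch below (illustrative only; it assumes the container is running locally and exposing port 7860 as configured in the Dockerfile, and the payload follows the SerpRequest/SerpQuery models defined in app.py further down):

import httpx

# Single-query search against one named backend ("duckduckgo" is registered in app.py).
payload = {
    "backend": "duckduckgo",
    "query": {"query": "solid state batteries", "n_results": 10, "sort_by": "relevance"},
}
resp = httpx.post("http://localhost:7860/serp/search", json=payload, timeout=60)
resp.raise_for_status()
for item in resp.json().get("results") or []:
    print(item["title"], "->", item["href"])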
app.py
ADDED
@@ -0,0 +1,258 @@
import asyncio
from contextlib import asynccontextmanager
import logging
from typing import Literal, Optional
from fastapi import FastAPI, APIRouter
from httpx import AsyncClient
from pydantic import BaseModel, Field
import uvicorn
from playwright.async_api import async_playwright, Browser

from scrap.base import ScrapperBackendBase
from scrap.gpatents import GpatentsScrapBackend
from serp.base import SERPBackendBase, SerpQuery, SerpResultItem, get_backends_doc, query_serp_backend
from serp.arxiv import ArxivSerpBackend
from serp.bing import BingSerpBackend
from serp.duckduckgo import DuckDuckGoSerpBackend
from serp.gpatents import GPatentsSerpBackend


logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# playwright global context
playwright = None
pw_browser: Optional[Browser] = None

# HttpX client
http_client = AsyncClient()


@asynccontextmanager
async def api_lifespan(app: FastAPI):
    """Lifespan context manager for FastAPI to manage the Playwright browser instance."""
    global playwright, pw_browser
    playwright = await async_playwright().start()
    # Headless mode is required inside the slim container, which has no display server.
    pw_browser = await playwright.chromium.launch(headless=True)

    yield

    await pw_browser.close()
    await playwright.stop()

app = FastAPI(lifespan=api_lifespan)
serp_router = APIRouter(prefix="/serp", tags=["SERP"])
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])

# All backends for SERP scraping
SERP_BACKENDS = {b.name: b for b in [DuckDuckGoSerpBackend(),
                                     BingSerpBackend(),
                                     ArxivSerpBackend(),
                                     GPatentsSerpBackend()]}

# All backends for full-content scraping
SCRAP_BACKENDS = {b.content_type: b for b in [
    GpatentsScrapBackend()
]}

app.description = get_backends_doc(SERP_BACKENDS.values())


# ======================== Request schemas for SERP search ================================

class SerpRequest(BaseModel):
    """Model for a single-query SERP search"""
    query: SerpQuery = Field(..., description="The query to perform")
    backend: str = Field(..., description="The backend to use for search.")


class SerpResponse(BaseModel):
    """Model for single-query SERP search results"""
    error: Optional[str]
    backend: Optional[str]
    results: Optional[list[SerpResultItem]] = Field(
        None, description="List of search results for the query")


class SerpBulkRequest(BaseModel):
    """A bulk request with many queries"""
    queries: list[SerpQuery] = Field(...,
                                     description="List of queries to perform")
    backend: str = Field(...,
                         description="The backend to use for the bulk search.")


class SerpBulkResultItem(BaseModel):
    """Intermediate result item for bulk results"""
    query: str
    error: Optional[str]
    backend: Optional[str]
    results: Optional[list[SerpResultItem]]

    @classmethod
    def from_err(cls, q: SerpQuery, err: Exception, backend: str):
        return cls(query=q.query, error=str(err), results=None, backend=backend)

    @classmethod
    def from_results(cls, q: SerpQuery, results: list[SerpResultItem], backend: str):
        return cls(query=q.query, error=None, results=results, backend=backend)


class SerpBulkResponse(BaseModel):
    """Response to a bulk query"""
    queries: list[SerpBulkResultItem]


class SerpAutoSearchRequest(BaseModel):
    query: SerpQuery
    category: Literal["general", "patent", "scholar"] = "general"


class SerpAutoBulkSearchRequest(BaseModel):
    """An auto-bulk request with many queries"""
    queries: list[SerpQuery]
    category: Literal["general", "patent", "scholar"] = "general"

# =============================================== SERP routes ============================================


@serp_router.post("/search")
async def search(req: SerpRequest) -> SerpResponse:
    """Performs a single SERP search against the given backend with the given query."""

    # Find the backend with the given name
    backend: SERPBackendBase = SERP_BACKENDS.get(req.backend)

    if backend:
        try:
            results = await query_serp_backend(backend, req.query, http_client, pw_browser)
            return SerpResponse(error=None, results=results, backend=backend.name)
        except Exception as e:
            logging.warning(f"Error while querying {backend.name}: {e}")
            return SerpResponse(error=str(e), results=[], backend=backend.name)

    return SerpResponse(error="No backend with the given backend name was found.", backend=req.backend, results=None)


@serp_router.post("/search/bulk")
async def search_bulk(req: SerpBulkRequest) -> SerpBulkResponse:
    """Performs a bulk SERP search against the given backend with the given queries."""

    # Find the backend with the given name
    backend: SERPBackendBase = SERP_BACKENDS.get(req.backend)

    if backend:
        logging.info(
            f"Bulk querying {backend.name} with queries: {req.queries}")

        results = await asyncio.gather(*[query_serp_backend(backend, q, http_client, pw_browser) for q in req.queries], return_exceptions=True)

        for r in results:
            if isinstance(r, Exception):
                logging.warning(
                    f"Exception occurred while querying {backend.name}: {r}")

        result_sections = [SerpBulkResultItem.from_err(q, r, backend.name)
                           if isinstance(r, Exception) else SerpBulkResultItem.from_results(q, r, backend.name)
                           for r, q in zip(results, req.queries)]

        return SerpBulkResponse(queries=result_sections)

    # No matching backend: report the error per query so the response still matches SerpBulkResponse.
    return SerpBulkResponse(queries=[
        SerpBulkResultItem(query=q.query, error="No backend with the given backend name was found.",
                           backend=req.backend, results=None)
        for q in req.queries])


@serp_router.post("/fallback_search")
async def search_fallback(auto: SerpAutoSearchRequest) -> SerpResponse:
    """Searches the given query with the first usable backend that matches the requested content category and falls back to the next one if it fails."""

    logging.info(f"Auto-search with query {auto}")
    for backend in SERP_BACKENDS.values():
        if backend.category.lower() != auto.category.lower():
            continue

        try:
            results = await query_serp_backend(backend, auto.query, http_client, pw_browser)
            return SerpResponse(error=None, results=results, backend=backend.name)
        except Exception as e:
            logging.warning(f"Search with {backend.name} backend failed: {e}")
            logging.info("Trying out query with next available backend.")
            continue

    return SerpResponse(error="No adequate backend found or all backends are rate-limited", results=None, backend=None)


@serp_router.post("/fallback_search/bulk")
async def search_fallback_bulk(auto: SerpAutoBulkSearchRequest) -> SerpBulkResponse:
    """Bulk searches the given queries with the first usable backend that matches the requested content category and falls back to the next one if it fails."""

    logging.info(f"Bulk Auto-search with query {auto}")

    async def _process_q(q: SerpQuery):
        for backend in SERP_BACKENDS.values():
            if backend.category.lower() != auto.category.lower():
                continue

            try:
                results = await query_serp_backend(backend, q, http_client, pw_browser)
                return (results, backend)
            except Exception as e:
                logging.warning(
                    f"Search with {backend.name} backend failed: {e}")
                continue

        raise Exception(
            "No adequate backend found or all backends are rate-limited")

    tasks = await asyncio.gather(*[_process_q(q) for q in auto.queries], return_exceptions=True)
    result_sections = [SerpBulkResultItem.from_err(q, r, "")
                       if isinstance(r, Exception) else SerpBulkResultItem.from_results(q, r[0], r[1].name)
                       for r, q in zip(tasks, auto.queries)]

    return SerpBulkResponse(queries=result_sections)

# =============================================== scrapping routes ============================================


class FetchContentResponse(BaseModel):
    error: Optional[str]
    backend: Optional[str]
    content: Optional[dict]


@scrap_router.get("/full_contents")
async def full_content(slug: str) -> FetchContentResponse:
    splitted_slug = slug.split(":", 1)
    content_type = splitted_slug[0]
    content_id = splitted_slug[1]

    backend: ScrapperBackendBase = SCRAP_BACKENDS.get(content_type)

    if backend:
        try:
            result = await backend.scrap(http_client, content_id)
            return FetchContentResponse(error=None, backend=backend.content_type, content=result.model_dump())
        except Exception as e:
            return FetchContentResponse(error=str(e), backend=backend.content_type, content=None)

    return FetchContentResponse(error="No supporting backend found", backend="", content=None)


@scrap_router.get("/fetch_content")
async def fetch_content(back: str, id: str) -> FetchContentResponse:
    backend: ScrapperBackendBase = SCRAP_BACKENDS.get(back)

    if backend:
        try:
            result = await backend.scrap(http_client, id)
            return FetchContentResponse(error=None, backend=back, content=result.model_dump())
        except Exception as e:
            return FetchContentResponse(error=str(e), backend=back, content=None)

    return FetchContentResponse(error="No backend found", backend=back, content=None)


app.include_router(serp_router)
app.include_router(scrap_router)

uvicorn.run(app, host="0.0.0.0", port=7860)
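A usage sketch for the fallback endpoints defined above (illustrative only; it assumes the service is reachable on localhost:7860 and the payload mirrors the SerpAutoBulkSearchRequest schema):

import httpx

# Each query is tried against every backend of the requested category,
# in registration order, until one of them succeeds.
payload = {
    "category": "scholar",
    "queries": [
        {"query": "retrieval augmented generation survey", "n_results": 10},
        {"query": "state space models for sequence modeling", "n_results": 10},
    ],
}
resp = httpx.post("http://localhost:7860/serp/fallback_search/bulk", json=payload, timeout=120)
for entry in resp.json()["queries"]:
    print(entry["query"], entry["backend"], entry["error"])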
scrap/base.py
ADDED
@@ -0,0 +1,17 @@
from abc import abstractmethod
from httpx import AsyncClient
from pydantic import BaseModel


class ScrapperBackendBase(BaseModel):
    """Base class for a source scrapper"""

    @property
    @abstractmethod
    def content_type(self) -> str:
        """Type of content that this backend can scrap. Used to determine which scrap backend to use to fetch full contents."""
        pass

    @abstractmethod
    async def scrap(self, client: AsyncClient, id: str) -> BaseModel:
        pass
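To make the contract concrete, a minimal hypothetical subclass could look like the sketch below (not part of this commit; the backend name, result schema, and URL are invented for illustration). An instance would be registered in SCRAP_BACKENDS in app.py, keyed by its content_type.

from httpx import AsyncClient
from pydantic import BaseModel

from scrap.base import ScrapperBackendBase


class ExampleNoteResult(BaseModel):
    # Hypothetical result schema, for illustration only.
    id: str
    text: str


class ExampleNoteScrapBackend(ScrapperBackendBase):
    """Hypothetical backend that fetches a plain-text note by id from a made-up service."""

    @property
    def content_type(self) -> str:
        return "example-note"

    async def scrap(self, client: AsyncClient, id: str) -> ExampleNoteResult:
        # Placeholder URL, for illustration only.
        response = await client.get(f"https://example.com/notes/{id}.txt")
        response.raise_for_status()
        return ExampleNoteResult(id=id, text=response.text)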
scrap/gpatents.py
ADDED
@@ -0,0 +1,83 @@
import re
from typing import Optional
from bs4 import BeautifulSoup
from pydantic import BaseModel

from scrap.base import ScrapperBackendBase


class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # The title of the patent.
    title: str
    # The abstract of the patent, if available.
    abstract: Optional[str] = None
    # The full description of the patent containing the field of the invention, background, summary, etc.
    description: Optional[str] = None
    # The full claims of the patent.
    claims: Optional[str] = None
    # The field of the invention, if available.
    field_of_invention: Optional[str] = None
    # The background of the invention, if available.
    background: Optional[str] = None


class GpatentsScrapBackend(ScrapperBackendBase):
    @property
    def content_type(self):
        return "patent"

    async def scrap(self, client, id):
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
        }
        patent_url = f"https://patents.google.com/patent/{id}/en"
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(
            strip=True) if abstract_div else None

        # Description
        description_section = soup.find("section", itemprop="description")
        description = description_section.get_text(
            separator="\n", strip=True) if description_section else None

        # Field of the Invention
        invention_field_match = re.findall(
            r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
        invention_field = invention_field_match[0][1].strip(
        ) if invention_field_match else None

        # Background of the Invention
        invention_background_match = re.findall(
            r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
        invention_background = invention_background_match[0][1].strip(
        ) if invention_background_match else None

        # Claims
        claims_section = soup.find("section", itemprop="claims")
        claims = claims_section.get_text(
            separator="\n", strip=True) if claims_section else None

        # Patent Title
        meta_title = soup.find("meta", {"name": "DC.title"}).get(
            "content").strip()

        return PatentScrapResult(
            abstract=abstract,
            description=description,
            claims=claims,
            title=meta_title,
            field_of_invention=invention_field,
            background=invention_background
        )
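A standalone usage sketch for this backend (the publication id below is a placeholder, not a reference to a specific patent):

import asyncio

from httpx import AsyncClient

from scrap.gpatents import GpatentsScrapBackend


async def main():
    async with AsyncClient() as client:
        backend = GpatentsScrapBackend()
        # Placeholder publication id, for illustration only.
        result = await backend.scrap(client, "US1234567A")
        print(result.title)
        print((result.abstract or "")[:200])

asyncio.run(main())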
serp/arxiv.py
ADDED
@@ -0,0 +1,45 @@
from serp.base import SERPBackendBase, SerpResultItem
from lxml import etree


class ArxivSerpBackend(SERPBackendBase):
    @property
    def name(self):
        return "arxiv"

    async def query(self, query, client):
        """Searches arXiv for the specified query and returns a list of results with titles and PDF URLs."""
        ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
        ARXIV_API_URL = 'https://export.arxiv.org/api/query?'

        search_params = {
            'search_query': query.query,
            'start': 0,
            'max_results': query.n_results,
            'sortBy': "submittedDate" if query.sort_by == "date" else "relevance"
        }
        query_url = ARXIV_API_URL

        response = await client.get(query_url, params=search_params)
        response.raise_for_status()

        root = etree.fromstring(response.content)
        entries = root.findall('atom:entry', ATOM_NAMESPACE)

        results = []
        for entry in entries:
            title = entry.find(
                'atom:title', ATOM_NAMESPACE).text.strip().replace('\n', ' ')
            id = entry.find('atom:id', ATOM_NAMESPACE).text.strip()
            pdf_url = entry.find(
                'atom:id', ATOM_NAMESPACE).text.replace('/abs/', '/pdf/')
            summary = entry.find(
                'atom:summary', ATOM_NAMESPACE).text.strip()
            results.append(SerpResultItem(
                title=title, href=pdf_url, body=summary, id=id))

        return results

    @property
    def category(self):
        return "scholar"
serp/base.py
ADDED
@@ -0,0 +1,109 @@
from abc import ABC, ABCMeta, abstractmethod
from contextlib import asynccontextmanager
import logging
from typing import Literal, Optional
from httpx import AsyncClient
from pydantic import BaseModel, Field
from playwright.async_api import Browser, BrowserContext, Page
from asyncio import Semaphore

# ========================== Schemas ==========================


class SerpQuery(BaseModel):
    """Model for SERP query"""
    query: str = Field(
        ..., description="The query to search for")
    n_results: int = Field(
        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
    sort_by: Literal["relevance",
                     "date"] = Field(default="relevance", description="How to sort search results.")


class SerpResultItem(BaseModel):
    """Model for a single SERP result item"""
    title: str = Field(..., description="Title of the search result")
    href: str = Field(..., description="URL of the search result")
    body: Optional[str] = Field(
        None, description="Snippet of the search result")
    content_slug: Optional[str] = Field(
        None, description="Content slug of the search result. A slug that encodes the content type and URL that can be used to fetch the full content later")

    class Config:
        extra = "allow"  # Allow additional fields in the result item


# =============================== Base classes ===============================


class SERPBackendBase(ABC):
    """Base class for SERP scrapping backends"""

    def __init__(self):
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs"""
        pass

    @property
    @abstractmethod
    def category(self) -> Literal["general", "patent", "scholar"]:
        """Content category that the backend provides. Used for search_auto"""
        pass

    @abstractmethod
    async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
        """Perform a SERP query and return results"""
        pass


class PlaywrightSerpBackendBase(SERPBackendBase):
    """Base class for SERP scrapping backends using Playwright"""

    def __init__(self):
        pass

    async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
        """Perform a SERP query and return results using Playwright"""
        raise NotImplementedError("query_serp_page method must be used instead")

    @abstractmethod
    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Perform a SERP query using Playwright and return results"""
        pass


async def query_serp_backend(backend: SERPBackendBase, query: SerpQuery, client: AsyncClient, browser: Browser) -> list[SerpResultItem]:
    """Queries the given backend with the given SERP query."""
    logging.info(f"Querying {backend.name} with {query}")
    if isinstance(backend, PlaywrightSerpBackendBase):
        return await backend.query_serp_page(browser, query)
    else:
        return await backend.query(query, client)


def get_backends_doc(backends: list[SERPBackendBase]) -> str:
    """Retrieves all the available backends and builds a list for doc"""
    doc_str = "### Available SERP Backends \n\n\n "
    for backend in backends:
        doc_str += f" \n\n `{backend.name}` - category: `{backend.category}`"

    return doc_str


@asynccontextmanager
async def playwright_open_page(browser: Browser, sema: Semaphore):
    """Context manager for playwright pages"""
    # Acquire the concurrency semaphore
    await sema.acquire()
    context: BrowserContext = await browser.new_context()
    page: Page = await context.new_page()
    try:
        yield page
    finally:
        await page.close()
        await context.close()
        sema.release()
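As an illustration of the interface, a minimal hypothetical HTTP-based backend (invented endpoint and field names, not part of this commit) only needs name, category, and query; browser-driven backends subclass PlaywrightSerpBackendBase instead, and query_serp_backend dispatches between the two paths by type:

from httpx import AsyncClient

from serp.base import SERPBackendBase, SerpQuery, SerpResultItem


class ExampleJsonSerpBackend(SERPBackendBase):
    """Hypothetical backend querying a made-up JSON search API."""

    @property
    def name(self) -> str:
        return "example-json"

    @property
    def category(self):
        return "general"

    async def query(self, query: SerpQuery, client: AsyncClient) -> list[SerpResultItem]:
        # Placeholder endpoint and response shape, for illustration only.
        response = await client.get("https://example.com/search",
                                    params={"q": query.query, "n": query.n_results})
        response.raise_for_status()
        return [SerpResultItem(title=hit["title"], href=hit["url"], body=hit.get("snippet"))
                for hit in response.json()["hits"]]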
serp/bing.py
ADDED
@@ -0,0 +1,65 @@
from asyncio import Semaphore
from urllib.parse import quote_plus
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page


class BingSerpBackend(PlaywrightSerpBackendBase):

    def __init__(self):
        super().__init__()
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(4)

    @property
    def name(self) -> str:
        return "bing"

    async def query_serp_page(self, browser, query: SerpQuery):
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:
            async def _block_resources(route, request):
                if request.resource_type in ["stylesheet", "image"]:
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://www.bing.com/search?q={quote_plus(query.query)}"
            await page.goto(url)

            await page.wait_for_selector("li.b_algo")

            results = []

            items = await page.query_selector_all("li.b_algo")
            for item in items[:query.n_results]:
                title_el = await item.query_selector("h2 > a")
                url = await title_el.get_attribute("href") if title_el else None
                title = await title_el.inner_text() if title_el else ""

                snippet = ""

                # Try several fallback selectors
                for selector in [
                    "div.b_caption p",  # typical snippet
                    "div.b_caption",    # sometimes snippet is here
                    "div.b_snippet",    # used in some result types
                    "div.b_text",       # used in some panels
                    "p"                 # fallback to any paragraph
                ]:
                    snippet_el = await item.query_selector(selector)
                    if snippet_el:
                        snippet = await snippet_el.inner_text()
                        if snippet.strip():
                            break

                if title and url:
                    results.append(SerpResultItem(
                        title=title.strip(), href=url.strip(), body=snippet.strip()))

            return results

    @property
    def category(self):
        return "general"
serp/duckduckgo.py
ADDED
@@ -0,0 +1,29 @@
from duckduckgo_search import DDGS
from serp.base import SERPBackendBase, SerpResultItem


class DuckDuckGoSerpBackend(SERPBackendBase):

    def __init__(self):
        self.ddg = DDGS()
        super().__init__()

    @property
    def name(self):
        return "duckduckgo"

    async def query(self, query, client) -> list[SerpResultItem]:
        results = []

        for result in self.ddg.text(query.query, max_results=query.n_results):
            results.append(SerpResultItem(
                title=result["title"],
                body=result["body"],
                href=result["href"],
                content_slug=None))

        return results

    @property
    def category(self):
        return "general"
serp/gpatents.py
ADDED
@@ -0,0 +1,78 @@
from asyncio import Semaphore
import re
from urllib.parse import quote_plus
from playwright.async_api import Browser

from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page


class GPatentsSerpBackend(PlaywrightSerpBackendBase):
    """GPatents SERP backend for scraping patent data."""

    def __init__(self):
        super().__init__()
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(2)

    @property
    def name(self) -> str:
        """Name of the backend. Used for identification in slugs."""
        return "gpatents"

    async def query_serp_page(self, browser: Browser, query: SerpQuery) -> list[SerpResultItem]:
        """Query the GPatents SERP page and return a list of SerpResultItem."""

        # regex to locate a patent id
        PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"

        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:

            async def _block_resources(route, request):
                if request.resource_type in ["stylesheet", "image"]:
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", _block_resources)

            url = f"https://patents.google.com/?q={quote_plus(query.query)}&num={query.n_results}"
            await page.goto(url)

            # Wait for at least one search result item to appear
            # This ensures the page has loaded enough to start scraping
            await page.wait_for_function(
                """() => document.querySelectorAll('search-result-item').length >= 1""",
                timeout=30_000
            )

            items = await page.locator("search-result-item").all()
            results = []
            for item in items:
                text = " ".join(await item.locator("span").all_inner_texts())
                match = re.search(PATENT_ID_REGEX, text)
                if not match:
                    continue

                patent_id = match.group()

                try:
                    title = await item.locator("h3, h4").first.inner_text(timeout=1000)
                    body = await item.locator("div.abstract, div.result-snippet, .snippet, .result-text").first.inner_text(timeout=1000)
                except Exception:
                    continue  # If we can't get title or body, skip this item

                results.append(SerpResultItem(
                    href=f"https://patents.google.com/patent/{patent_id}/en",
                    title=title,
                    body=body,
                    patent_id=patent_id,
                    # Slug uses the scrap backend's content_type ("patent") so that
                    # /scrap/full_contents can resolve it against SCRAP_BACKENDS.
                    content_slug=f"patent:{patent_id}"
                ))

            return results

    @property
    def category(self):
        return "patent"