Spaces:

Game4all
/

SERPent

Running

App Files Files Community

Game4all committed 13 days ago

Commit

3beb07e

1 Parent(s): 551703a

Add search_duck + search route

Browse files

Files changed (4) hide show

.gitignore +2 -0
app.py +64 -93
backends.py +140 -0
requirements.txt +3 -1

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__
2	+ .vscode

app.py CHANGED Viewed

@@ -1,5 +1,9 @@
 from contextlib import asynccontextmanager
 from typing import Optional
 from fastapi import FastAPI
 from pydantic import BaseModel, Field
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page
@@ -8,11 +12,13 @@ import logging
 import re
 import uvicorn
 logging.basicConfig(level=logging.INFO)
 # playwright global context
 playwright = None
-pw_browser: Browser = None
 @asynccontextmanager
@@ -27,6 +33,7 @@ async def api_lifespan(app: FastAPI):
     await playwright.stop()
 app = FastAPI(lifespan=api_lifespan)
 class APISearchParams(BaseModel):
@@ -36,95 +43,6 @@ class APISearchParams(BaseModel):
         10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
-class APIPatentResults(BaseModel):
-    """Response of /search_patents endpoint"""
-    error: Optional[str]
-    results: Optional[list[dict]]
-class APIBraveResults(BaseModel):
-    """Response of /search_brave endpoint"""
-    error: Optional[str]
-    results: Optional[list[dict]]
-async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
-    """Queries google patents for the specified query and number of results. Returns relevant patents"""
-    context: BrowserContext = await browser.new_context()
-    page: Page = await context.new_page()
-    async def _block_resources(route, request):
-        if request.resource_type in ["stylesheet", "image"]:
-            await route.abort()
-        else:
-            await route.continue_()
-    await page.route("**/*", _block_resources)
-    url = f"https://patents.google.com/?q=({quote_plus(q)})&oq={quote_plus(q)}&num={n_results}"
-    await page.goto(url)
-    await page.wait_for_function(
-        f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
-        timeout=30_000
-    )
-    # regex to locate a patent id
-    PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
-    items = await page.locator("search-result-item").all()
-    id_matches = []
-    for item in items:
-        all_text = " ".join(await item.locator("span").all_inner_texts())
-        found = re.findall(PATENT_ID_REGEX, all_text)
-        if found:
-            id_matches.append(found[0])
-    await context.close()
-    patents = [{"href": f"https://patents.google.com/patent/{id}/en", "id": id}
-               for id in id_matches]
-    return patents[:n_results]
-async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
-    """Queries brave search for the specified query"""
-    context: BrowserContext = await browser.new_context()
-    page: Page = await context.new_page()
-    async def _block_resources(route, request):
-        if request.resource_type in ["stylesheet", "image"]:
-            await route.abort()
-        else:
-            await route.continue_()
-    await page.route("**/*", _block_resources)
-    url = f"https://search.brave.com/search?q={quote_plus(q)}"
-    await page.goto(url)
-    results_cards = await page.locator('.snippet').all()
-    if len(results_cards) == 0:
-        logging.warning(f"No results for query: {q}")
-        logging.warning(await page.content())
-    results = []
-    for result in results_cards:
-        title = await result.locator('.title').all_inner_texts()
-        description = await result.locator('.snippet-description').all_inner_texts()
-        url = await result.locator('a').nth(0).get_attribute('href')
-        if url.startswith('/'):
-            continue
-        results.append({"title": title[0] if len(title) > 0 else "", "body": description[0] if len(
-            description) > 0 else "",  "href": url})
-    return results[:n_results]
 @app.post("/search_scholar")
 async def query_google_scholar(params: APISearchParams):
     """Queries google scholar for the specified query"""
@@ -133,7 +51,7 @@ async def query_google_scholar(params: APISearchParams):
 @app.get('/')
 async def status():
-    return {"status": "running"}
 @app.post("/search_patents")
@@ -146,24 +64,77 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
             res = await query_google_patents(pw_browser, q, params.n_results)
             results.extend(res)
         except Exception as e:
             logging.error(
                 f"Failed to query Google Patents with query `{q}`: {e}")
     return APIPatentResults(results=results, error=None)
 @app.post("/search_brave")
-async def search_brave(params: APISearchParams) -> APIBraveResults:
     """Searches brave search for the specified queries and returns the found documents."""
     results = []
     for q in params.queries:
         logging.info(f"Searching Brave search with query `{q}`")
         try:
             res = await query_brave_search(pw_browser, q, params.n_results)
             results.extend(res)
         except Exception as e:
             logging.error(
                 f"Failed to query Brave search with query `{q}`: {e}")
-    return APIBraveResults(results=results, error=None)
 uvicorn.run(app, host="0.0.0.0", port=7860)

 from contextlib import asynccontextmanager
+import json
 from typing import Optional
+from duckduckgo_search import DDGS
+from duckduckgo_search.exceptions import RatelimitException
+import expiringdict
 from fastapi import FastAPI
 from pydantic import BaseModel, Field
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 import re
 import uvicorn
+from backends import APISearchResults, APIPatentResults, query_brave_search, query_ddg_search, query_google_patents
 logging.basicConfig(level=logging.INFO)
 # playwright global context
 playwright = None
+pw_browser: Optional[Browser] = None
 @asynccontextmanager
     await playwright.stop()
 app = FastAPI(lifespan=api_lifespan)
+backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
 class APISearchParams(BaseModel):
         10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
 @app.post("/search_scholar")
 async def query_google_scholar(params: APISearchParams):
     """Queries google scholar for the specified query"""
 @app.get('/')
 async def status():
+    """Health-check route: reports that the API is running plus the recorded backend statuses."""
+    # backend_status is the module-level ExpiringDict (entries expire after 15 minutes).
+    # NOTE(review): items_with_timestamp() presumably pairs each stored value with its
+    # insertion time - confirm the returned structure serializes to JSON as expected.
+    return {"status": "running", "backend_status": backend_status.items_with_timestamp()}
 @app.post("/search_patents")
             res = await query_google_patents(pw_browser, q, params.n_results)
             results.extend(res)
         except Exception as e:
+            backend_status["gpatents"] = "rate-limited"
             logging.error(
                 f"Failed to query Google Patents with query `{q}`: {e}")
     return APIPatentResults(results=results, error=None)
 @app.post("/search_brave")
+async def search_brave(params: APISearchParams) -> APISearchResults:
     """Searches brave search for the specified queries and returns the found documents."""
     results = []
+    # Most recent failure; it is only reported when no query produced any result.
+    last_exception: Optional[Exception] = None
     for q in params.queries:
         logging.info(f"Searching Brave search with query `{q}`")
         try:
             res = await query_brave_search(pw_browser, q, params.n_results)
             results.extend(res)
         except Exception as e:
+            last_exception = e
+            # NOTE(review): every exception is recorded as "rate-limited", whatever its
+            # actual cause - confirm this is the intended meaning of the status entry.
+            backend_status["brave"] = "rate-limited"
             logging.error(
                 f"Failed to query Brave search with query `{q}`: {e}")
+    # Partial success wins: the error text is returned only when `results` is empty.
+    return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+@app.post("/search_duck")
+async def search_duck(params: APISearchParams) -> APISearchResults:
+    """Searches duckduckgo for the specified queries and returns the found documents"""
+    results = []
+    last_exception: Optional[Exception] = None
+    for q in params.queries:
+        logging.info(f"Querying DDG with query: `{q}`")
+        try:
+            res = await query_ddg_search(q, params.n_results)
+            results.extend(res)
+        except Exception as e:
+            last_exception = e
+            backend_status["duckduckgo"] = "rate-limited"
+            logging.error(f"Failed to query DDG with query `{q}`: {e}")
+    return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+@app.post("/search")
+async def search(params: APISearchParams):
+    """Attempts to search the specified queries using ALL backends"""
+    results = []
+    for q in params.queries:
+        try:
+            logging.info(f"Querying DDG with query: `{q}`")
+            res = await query_ddg_search(q, params.n_results)
+            results.extend(res)
+            continue
+        except Exception as e:
+            logging.error(f"Failed to query DDG with query `{q}`: {e}")
+            logging.info("Trying with next browser backend.")
+        try:
+            logging.info(f"Querying Brave Search with query: `{q}`")
+            res = await query_brave_search(pw_browser, q, params.n_results)
+            results.extend(res)
+            continue
+        except Exception as e:
+            logging.error(
+                f"Failed to query Brave Search with query `{q}`: {e}")
+        if len(results) == 0:
+            return APISearchResults(results=[], error="All backends are rate-limited.")
+    return APISearchResults(results=results, error=None)
 uvicorn.run(app, host="0.0.0.0", port=7860)

backends.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from contextlib import asynccontextmanager
+from typing import Optional
+from duckduckgo_search import DDGS
+from pydantic import BaseModel
+from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
+from urllib.parse import quote_plus
+import logging
+import re
+class APIPatentResults(BaseModel):
+    """Response of /search_patents endpoint"""
+    error: Optional[str]
+    results: Optional[list[dict]]
+class APISearchResults(BaseModel):
+    error: Optional[str]
+    results: Optional[list[dict]]
+class BraveSearchBlockedException(Exception):
+    """Raised when Brave Search returns no result cards and the page mentions "suspicious".
+
+    Carries no data of its own; it is a marker type so that a flagged headless
+    browser can be told apart from other failures.
+    """
+    pass
+@asynccontextmanager
+async def playwright_open_page(browser: Browser):
+    """Context manager for playwright pages"""
+    context: BrowserContext = await browser.new_context()
+    page: Page = await context.new_page()
+    try:
+        yield page
+    finally:
+        await page.close()
+        await context.close()
+async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
+    """Queries google patents for the specified query and number of results. Returns relevant patents"""
+    async with playwright_open_page(browser) as page:
+        async def _block_resources(route, request):
+            if request.resource_type in ["stylesheet", "image"]:
+                await route.abort()
+            else:
+                await route.continue_()
+        await page.route("**/*", _block_resources)
+        url = f"https://patents.google.com/?q=({quote_plus(q)})&oq={quote_plus(q)}&num={n_results}"
+        await page.goto(url)
+        await page.wait_for_function(
+            f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
+            timeout=30_000
+        )
+        # regex to locate a patent id
+        PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
+        items = await page.locator("search-result-item").all()
+        id_matches = []
+        for item in items:
+            all_text = " ".join(await item.locator("span").all_inner_texts())
+            found = re.findall(PATENT_ID_REGEX, all_text)
+            if found:
+                id_matches.append(found[0])
+        patents = [{"href": f"https://patents.google.com/patent/{id}/en", "id": id}
+                   for id in id_matches]
+    return patents[:n_results]
+async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
+    """Queries Brave Search for the specified query."""
+    async with playwright_open_page(browser) as page:
+        async def _block_resources(route, request):
+            if request.resource_type in ["stylesheet", "image"]:
+                await route.abort()
+            else:
+                await route.continue_()
+        await page.route("**/*", _block_resources)
+        url = f"https://search.brave.com/search?q={quote_plus(q)}"
+        await page.goto(url)
+        results_cards = await page.locator('.snippet').all()
+        if len(results_cards) == 0:
+            logging.warning(f"No results for query: {q}")
+            page_content = await page.content()
+            if "suspicious" in page_content:
+                logging.warning("Brave search flagged browser as suspicious.")
+                raise BraveSearchBlockedException()
+        results = []
+        try:
+            for result in results_cards:
+                title = await result.locator('.title').all_inner_texts()
+                description = await result.locator('.snippet-description').all_inner_texts()
+                url = await result.locator('a').nth(0).get_attribute('href')
+                # Filter out results with no URL or brave-specific URLs
+                if url is None or url.startswith('/'):
+                    continue
+                results.append({
+                    "title": title[0] if title else "",
+                    "body": description[0] if description else "",
+                    "href": url
+                })
+                if len(results) >= n_results:
+                    break
+        except TimeoutError as e:
+            logging.warning(
+                f"Timeout on selector while parsing Brave Search SERP: {e}")
+        return results
+async def query_ddg_search(q: str, n_results: int = 10):
+    """Queries duckduckgo search for the specified query"""
+    ddgs = DDGS()
+    results = []
+    for result in ddgs.text(q, max_results=n_results):
+        results.append(
+            {"title": result["title"], "body": result["body"], "href": result["href"]})
+    return results

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
 fastapi
 uvicorn
 pydantic
-playwright

 fastapi
 uvicorn
 pydantic
+playwright
+duckduckgo_search
+expiringdict