Add search_duck + search route
Browse files- .gitignore +2 -0
- app.py +64 -93
- backends.py +140 -0
- requirements.txt +3 -1
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
.vscode
|
app.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
from contextlib import asynccontextmanager
|
|
|
2 |
from typing import Optional
|
|
|
|
|
|
|
3 |
from fastapi import FastAPI
|
4 |
from pydantic import BaseModel, Field
|
5 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
@@ -8,11 +12,13 @@ import logging
|
|
8 |
import re
|
9 |
import uvicorn
|
10 |
|
|
|
|
|
11 |
logging.basicConfig(level=logging.INFO)
|
12 |
|
13 |
# playwright global context
|
14 |
playwright = None
|
15 |
-
pw_browser: Browser = None
|
16 |
|
17 |
|
18 |
@asynccontextmanager
|
@@ -27,6 +33,7 @@ async def api_lifespan(app: FastAPI):
|
|
27 |
await playwright.stop()
|
28 |
|
29 |
app = FastAPI(lifespan=api_lifespan)
|
|
|
30 |
|
31 |
|
32 |
class APISearchParams(BaseModel):
|
@@ -36,95 +43,6 @@ class APISearchParams(BaseModel):
|
|
36 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
37 |
|
38 |
|
39 |
-
class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint"""

    # Error text for the caller; None when no error is reported.
    error: Optional[str]

    # Patent entries collected over all queries; each entry is a plain dict.
    results: Optional[list[dict]]
|
43 |
-
|
44 |
-
|
45 |
-
class APIBraveResults(BaseModel):
    """Response of /search_brave endpoint"""

    # Error text for the caller; None when no error is reported.
    error: Optional[str]

    # Search hits collected over all queries; each hit is a plain dict.
    results: Optional[list[dict]]
|
49 |
-
|
50 |
-
|
51 |
-
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Query Google Patents and return up to ``n_results`` matching patents.

    Args:
        browser: Playwright browser used to open a throwaway browser context.
        q: Free-text search query.
        n_results: Number of results requested from the page and the maximum
            number of entries returned.

    Returns:
        A list of dicts with the keys ``"href"`` (patent page URL) and
        ``"id"`` (the patent id found in the result item).

    Raises:
        Any Playwright error raised while navigating, including the timeout
        raised when the page does not show at least ``n_results`` result
        items within 30 seconds.
    """
    # regex to locate a patent id
    PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"

    async def _block_resources(route, request):
        # Stylesheets and images are not needed to read the result text.
        if request.resource_type in ["stylesheet", "image"]:
            await route.abort()
        else:
            await route.continue_()

    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()
        await page.route("**/*", _block_resources)

        url = f"https://patents.google.com/?q=({quote_plus(q)})&oq={quote_plus(q)}&num={n_results}"
        await page.goto(url)

        await page.wait_for_function(
            f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
            timeout=30_000
        )

        items = await page.locator("search-result-item").all()
        id_matches = []
        for item in items:
            all_text = " ".join(await item.locator("span").all_inner_texts())
            found = re.findall(PATENT_ID_REGEX, all_text)
            if found:
                # Only the first id-looking token of each result item is kept.
                id_matches.append(found[0])
    finally:
        # Close the context on every path: a failed navigation or a wait
        # timeout must not leave the context open.
        await context.close()

    patents = [{"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id}
               for patent_id in id_matches]
    return patents[:n_results]
|
88 |
-
|
89 |
-
|
90 |
-
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Query Brave Search and return up to ``n_results`` organic results.

    Args:
        browser: Playwright browser used to open a throwaway browser context.
        q: Free-text search query.
        n_results: Maximum number of results returned.

    Returns:
        A list of dicts with the keys ``"title"``, ``"body"`` and ``"href"``.
        Cards without a link, or with a site-relative link (starting with
        ``/``), are skipped.

    Raises:
        Any Playwright error raised while navigating or reading the page.
    """

    async def _block_resources(route, request):
        # Stylesheets and images are not needed to read the result text.
        if request.resource_type in ["stylesheet", "image"]:
            await route.abort()
        else:
            await route.continue_()

    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()
        await page.route("**/*", _block_resources)

        url = f"https://search.brave.com/search?q={quote_plus(q)}"
        await page.goto(url)

        results_cards = await page.locator('.snippet').all()

        if len(results_cards) == 0:
            # Dump the page so an empty result set can be diagnosed from the logs.
            logging.warning(f"No results for query: {q}")
            logging.warning(await page.content())

        results = []

        for result in results_cards:
            title = await result.locator('.title').all_inner_texts()
            description = await result.locator('.snippet-description').all_inner_texts()
            href = await result.locator('a').nth(0).get_attribute('href')

            # get_attribute yields None when the anchor has no href; guard it
            # so that one malformed card cannot fail the whole query.
            if href is None or href.startswith('/'):
                continue

            results.append({"title": title[0] if len(title) > 0 else "", "body": description[0] if len(
                description) > 0 else "", "href": href})

        return results[:n_results]
    finally:
        # The context used to be left open on every call; always release it.
        await context.close()
|
126 |
-
|
127 |
-
|
128 |
@app.post("/search_scholar")
|
129 |
async def query_google_scholar(params: APISearchParams):
|
130 |
"""Queries google scholar for the specified query"""
|
@@ -133,7 +51,7 @@ async def query_google_scholar(params: APISearchParams):
|
|
133 |
|
134 |
@app.get('/')
async def status():
    # Liveness endpoint: always answers with the same static payload.
    liveness = {"status": "running"}
    return liveness
|
137 |
|
138 |
|
139 |
@app.post("/search_patents")
|
@@ -146,24 +64,77 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
|
|
146 |
res = await query_google_patents(pw_browser, q, params.n_results)
|
147 |
results.extend(res)
|
148 |
except Exception as e:
|
|
|
149 |
logging.error(
|
150 |
f"Failed to query Google Patents with query `{q}`: {e}")
|
151 |
return APIPatentResults(results=results, error=None)
|
152 |
|
153 |
|
154 |
@app.post("/search_brave")
|
155 |
-
async def search_brave(params: APISearchParams) ->
|
156 |
"""Searches brave search for the specified queries and returns the found documents."""
|
157 |
results = []
|
|
|
158 |
for q in params.queries:
|
159 |
logging.info(f"Searching Brave search with query `{q}`")
|
160 |
try:
|
161 |
res = await query_brave_search(pw_browser, q, params.n_results)
|
162 |
results.extend(res)
|
163 |
except Exception as e:
|
|
|
|
|
164 |
logging.error(
|
165 |
f"Failed to query Brave search with query `{q}`: {e}")
|
166 |
|
167 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
# Serve `app` with uvicorn on all interfaces (0.0.0.0), port 7860.
# NOTE(review): no `__main__` guard is visible here, so this appears to run on import as well - confirm.
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
1 |
from contextlib import asynccontextmanager
|
2 |
+
import json
|
3 |
from typing import Optional
|
4 |
+
from duckduckgo_search import DDGS
|
5 |
+
from duckduckgo_search.exceptions import RatelimitException
|
6 |
+
import expiringdict
|
7 |
from fastapi import FastAPI
|
8 |
from pydantic import BaseModel, Field
|
9 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
|
|
12 |
import re
|
13 |
import uvicorn
|
14 |
|
15 |
+
from backends import APISearchResults, APIPatentResults, query_brave_search, query_ddg_search, query_google_patents
|
16 |
+
|
17 |
logging.basicConfig(level=logging.INFO)
|
18 |
|
19 |
# playwright global context
|
20 |
playwright = None
|
21 |
+
pw_browser: Optional[Browser] = None
|
22 |
|
23 |
|
24 |
@asynccontextmanager
|
|
|
33 |
await playwright.stop()
|
34 |
|
35 |
app = FastAPI(lifespan=api_lifespan)
|
36 |
+
# Per-backend failure flags written by the search routes and exposed by the status route.
# Holds at most 5 entries; each entry expires 15 minutes after it was set.
backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
37 |
|
38 |
|
39 |
class APISearchParams(BaseModel):
|
|
|
43 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
44 |
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
@app.post("/search_scholar")
|
47 |
async def query_google_scholar(params: APISearchParams):
|
48 |
"""Queries google scholar for the specified query"""
|
|
|
51 |
|
52 |
@app.get('/')
async def status():
    # Liveness endpoint that also exposes the entries currently held in
    # `backend_status`, as returned by its `items_with_timestamp()` method.
    recent_backend_flags = backend_status.items_with_timestamp()
    return {"status": "running", "backend_status": recent_backend_flags}
|
55 |
|
56 |
|
57 |
@app.post("/search_patents")
|
|
|
64 |
res = await query_google_patents(pw_browser, q, params.n_results)
|
65 |
results.extend(res)
|
66 |
except Exception as e:
|
67 |
+
backend_status["gpatents"] = "rate-limited"
|
68 |
logging.error(
|
69 |
f"Failed to query Google Patents with query `{q}`: {e}")
|
70 |
return APIPatentResults(results=results, error=None)
|
71 |
|
72 |
|
73 |
@app.post("/search_brave")
async def search_brave(params: APISearchParams) -> APISearchResults:
    """Searches brave search for the specified queries and returns the found documents."""
    # Results of all queries are pooled; only the most recent failure is kept.
    last_exception: Optional[Exception] = None
    results = []

    for q in params.queries:
        logging.info(f"Searching Brave search with query `{q}`")
        try:
            results.extend(await query_brave_search(pw_browser, q, params.n_results))
        except Exception as e:
            last_exception = e
            # Any failure flags the backend; the flag is visible on the status route.
            backend_status["brave"] = "rate-limited"
            logging.error(
                f"Failed to query Brave search with query `{q}`: {e}")

    # An error is reported only when nothing at all could be returned.
    if results or not last_exception:
        error = None
    else:
        error = str(last_exception)

    return APISearchResults(results=results, error=error)
|
90 |
+
|
91 |
+
|
92 |
+
@app.post("/search_duck")
async def search_duck(params: APISearchParams) -> APISearchResults:
    """Searches duckduckgo for the specified queries and returns the found documents"""
    # Results of all queries are pooled; only the most recent failure is kept.
    last_exception: Optional[Exception] = None
    results = []

    for q in params.queries:
        logging.info(f"Querying DDG with query: `{q}`")
        try:
            results.extend(await query_ddg_search(q, params.n_results))
        except Exception as e:
            last_exception = e
            # Any failure flags the backend; the flag is visible on the status route.
            backend_status["duckduckgo"] = "rate-limited"
            logging.error(f"Failed to query DDG with query `{q}`: {e}")

    # An error is reported only when nothing at all could be returned.
    if results or not last_exception:
        error = None
    else:
        error = str(last_exception)

    return APISearchResults(results=results, error=error)
|
109 |
+
|
110 |
+
|
111 |
+
@app.post("/search")
async def search(params: APISearchParams) -> APISearchResults:
    """Attempts to search the specified queries using ALL backends"""
    # Each query is tried on DuckDuckGo first and on Brave Search only if
    # DuckDuckGo failed. Results of all queries are pooled into one list.
    results = []

    for q in params.queries:
        try:
            logging.info(f"Querying DDG with query: `{q}`")
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            # Flag the backend exactly as the /search_duck route does, so the
            # status route also reflects failures that happen through /search.
            backend_status["duckduckgo"] = "rate-limited"
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
            logging.info("Trying with next browser backend.")

        try:
            logging.info(f"Querying Brave Search with query: `{q}`")
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            # Same bookkeeping as the /search_brave route.
            backend_status["brave"] = "rate-limited"
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")

    # Nothing collected at all is reported as every backend being unavailable.
    if not results:
        return APISearchResults(results=[], error="All backends are rate-limited.")

    return APISearchResults(results=results, error=None)
|
139 |
|
140 |
# Serve `app` with uvicorn on all interfaces (0.0.0.0), port 7860.
# NOTE(review): no `__main__` guard is visible here, so this appears to run on import as well - confirm.
uvicorn.run(app, host="0.0.0.0", port=7860)
|
backends.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from contextlib import asynccontextmanager
|
2 |
+
from typing import Optional
|
3 |
+
from duckduckgo_search import DDGS
|
4 |
+
from pydantic import BaseModel
|
5 |
+
from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
|
6 |
+
from urllib.parse import quote_plus
|
7 |
+
import logging
|
8 |
+
import re
|
9 |
+
|
10 |
+
|
11 |
+
class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint"""

    # Error text for the caller; None when no error is reported.
    error: Optional[str]

    # Patent entries collected over all queries; each entry is a plain dict.
    results: Optional[list[dict]]
|
15 |
+
|
16 |
+
|
17 |
+
class APISearchResults(BaseModel):
    # Response model for search results: an optional error plus the hits.

    # Error text for the caller; None when no error is reported.
    error: Optional[str]

    # Search hits collected over all queries; each hit is a plain dict.
    results: Optional[list[dict]]
|
20 |
+
|
21 |
+
|
22 |
+
class BraveSearchBlockedException(Exception):
    """Raised when Brave Search flags the headless browser as suspicious.

    The exception may be raised without arguments. In that case a default
    message is used, so that ``str(exc)`` is never empty and callers that
    report ``str(exc)`` return something meaningful. Any explicitly passed
    arguments are forwarded to ``Exception`` unchanged.
    """

    DEFAULT_MESSAGE = "Brave Search flagged the browser as suspicious and blocked the request."

    def __init__(self, *args):
        # Fall back to the default message only when no argument was given.
        super().__init__(*(args or (self.DEFAULT_MESSAGE,)))
|
25 |
+
|
26 |
+
|
27 |
+
@asynccontextmanager
async def playwright_open_page(browser: Browser):
    """Async context manager yielding a page in a fresh browser context.

    Args:
        browser: Playwright browser in which the temporary context is created.

    Yields:
        A new page belonging to a new browser context.

    The page and its context are closed when the ``async with`` block exits,
    whether it exits normally or through an exception.
    """
    context: BrowserContext = await browser.new_context()
    try:
        # Creating the page inside the outer try ensures the context is
        # released even when `new_page()` itself fails.
        page: Page = await context.new_page()
        try:
            yield page
        finally:
            await page.close()
    finally:
        # Runs even if closing the page raised.
        await context.close()
|
37 |
+
|
38 |
+
|
39 |
+
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Search Google Patents for ``q`` and return at most ``n_results`` hits.

    Each hit is a dict with the keys ``"href"`` and ``"id"``. The page is
    awaited (up to 30 seconds) until it shows at least ``n_results`` result
    items, then the first patent-id-looking token of each item is collected.
    """
    # regex to locate a patent id
    patent_id_pattern = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"

    async def _skip_heavy_resources(route, request):
        # Let everything through except stylesheets and images.
        if request.resource_type not in ["stylesheet", "image"]:
            await route.continue_()
            return
        await route.abort()

    async with playwright_open_page(browser) as page:
        await page.route("**/*", _skip_heavy_resources)

        await page.goto(
            f"https://patents.google.com/?q=({quote_plus(q)})&oq={quote_plus(q)}&num={n_results}")

        await page.wait_for_function(
            f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
            timeout=30_000,
        )

        patent_ids = []
        for card in await page.locator("search-result-item").all():
            span_text = " ".join(await card.locator("span").all_inner_texts())
            hits = re.findall(patent_id_pattern, span_text)
            if not hits:
                continue
            patent_ids.append(hits[0])

        patents = []
        for patent_id in patent_ids:
            patents.append(
                {"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id})

        return patents[:n_results]
|
75 |
+
|
76 |
+
|
77 |
+
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Search Brave Search for ``q`` and return the parsed result cards.

    Each entry is a dict with the keys ``"title"``, ``"body"`` and ``"href"``.
    Cards without a link, or with a site-relative link, are ignored, and
    parsing stops once ``n_results`` entries have been collected.

    Raises:
        BraveSearchBlockedException: when no result card is found and the
            page content contains the word "suspicious".
    """

    async def _skip_heavy_resources(route, request):
        # Let everything through except stylesheets and images.
        if request.resource_type not in ["stylesheet", "image"]:
            await route.continue_()
            return
        await route.abort()

    async with playwright_open_page(browser) as page:
        await page.route("**/*", _skip_heavy_resources)
        await page.goto(f"https://search.brave.com/search?q={quote_plus(q)}")

        cards = await page.locator('.snippet').all()

        if not cards:
            logging.warning(f"No results for query: {q}")
            html = await page.content()

            if "suspicious" in html:
                logging.warning("Brave search flagged browser as suspicious.")
                raise BraveSearchBlockedException()

        collected = []

        try:
            for card in cards:
                titles = await card.locator('.title').all_inner_texts()
                descriptions = await card.locator('.snippet-description').all_inner_texts()
                href = await card.locator('a').nth(0).get_attribute('href')

                # Filter out results with no URL or brave-specific URLs
                if href is not None and not href.startswith('/'):
                    collected.append({
                        "title": titles[0] if titles else "",
                        "body": descriptions[0] if descriptions else "",
                        "href": href,
                    })

                    if len(collected) >= n_results:
                        break

        except TimeoutError as e:
            # Whatever was parsed before the timeout is still returned.
            logging.warning(
                f"Timeout on selector while parsing Brave Search SERP: {e}")

        return collected
|
129 |
+
|
130 |
+
|
131 |
+
async def query_ddg_search(q: str, n_results: int = 10):
    """Query DuckDuckGo text search for ``q``.

    Args:
        q: Free-text search query.
        n_results: Maximum number of results requested from the backend.

    Returns:
        A list of dicts with the keys ``"title"``, ``"body"`` and ``"href"``.

    Raises:
        Whatever the ``DDGS`` client raises; exceptions raised in the worker
        thread propagate to the awaiting caller.
    """
    # Imported locally so this block does not depend on a module-level import.
    import asyncio

    def _fetch():
        # The synchronous client call and the iteration over its results both
        # happen here, i.e. entirely in the worker thread.
        ddgs = DDGS()
        return [
            {"title": result["title"], "body": result["body"], "href": result["href"]}
            for result in ddgs.text(q, max_results=n_results)
        ]

    # `DDGS.text` is called synchronously; running it in a thread keeps the
    # event loop free to serve other requests while the search is in flight.
    return await asyncio.to_thread(_fetch)
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
fastapi
|
2 |
uvicorn
|
3 |
pydantic
|
4 |
-
playwright
|
|
|
|
|
|
1 |
fastapi
|
2 |
uvicorn
|
3 |
pydantic
|
4 |
+
playwright
|
5 |
+
duckduckgo_search
|
6 |
+
expiringdict
|