Game4all committed on
Commit
3beb07e
·
1 Parent(s): 551703a

Add search_duck + search route

Browse files
Files changed (4) hide show
  1. .gitignore +2 -0
  2. app.py +64 -93
  3. backends.py +140 -0
  4. requirements.txt +3 -1
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ .vscode
app.py CHANGED
@@ -1,5 +1,9 @@
1
  from contextlib import asynccontextmanager
 
2
  from typing import Optional
 
 
 
3
  from fastapi import FastAPI
4
  from pydantic import BaseModel, Field
5
  from playwright.async_api import async_playwright, Browser, BrowserContext, Page
@@ -8,11 +12,13 @@ import logging
8
  import re
9
  import uvicorn
10
 
 
 
11
  logging.basicConfig(level=logging.INFO)
12
 
13
  # playwright global context
14
  playwright = None
15
- pw_browser: Browser = None
16
 
17
 
18
  @asynccontextmanager
@@ -27,6 +33,7 @@ async def api_lifespan(app: FastAPI):
27
  await playwright.stop()
28
 
29
  app = FastAPI(lifespan=api_lifespan)
 
30
 
31
 
32
  class APISearchParams(BaseModel):
@@ -36,95 +43,6 @@ class APISearchParams(BaseModel):
36
  10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
37
 
38
 
39
- class APIPatentResults(BaseModel):
40
- """Response of /search_patents endpoint"""
41
- error: Optional[str]
42
- results: Optional[list[dict]]
43
-
44
-
45
- class APIBraveResults(BaseModel):
46
- """Response of /search_brave endpoint"""
47
- error: Optional[str]
48
- results: Optional[list[dict]]
49
-
50
-
51
- async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
52
- """Queries google patents for the specified query and number of results. Returns relevant patents"""
53
- context: BrowserContext = await browser.new_context()
54
- page: Page = await context.new_page()
55
-
56
- async def _block_resources(route, request):
57
- if request.resource_type in ["stylesheet", "image"]:
58
- await route.abort()
59
- else:
60
- await route.continue_()
61
-
62
- await page.route("**/*", _block_resources)
63
-
64
- url = f"https://patents.google.com/?q=({quote_plus(q)})&oq={quote_plus(q)}&num={n_results}"
65
- await page.goto(url)
66
-
67
- await page.wait_for_function(
68
- f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
69
- timeout=30_000
70
- )
71
-
72
- # regex to locate a patent id
73
- PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
74
-
75
- items = await page.locator("search-result-item").all()
76
- id_matches = []
77
- for item in items:
78
- all_text = " ".join(await item.locator("span").all_inner_texts())
79
- found = re.findall(PATENT_ID_REGEX, all_text)
80
- if found:
81
- id_matches.append(found[0])
82
-
83
- await context.close()
84
-
85
- patents = [{"href": f"https://patents.google.com/patent/{id}/en", "id": id}
86
- for id in id_matches]
87
- return patents[:n_results]
88
-
89
-
90
- async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
91
- """Queries brave search for the specified query"""
92
- context: BrowserContext = await browser.new_context()
93
- page: Page = await context.new_page()
94
-
95
- async def _block_resources(route, request):
96
- if request.resource_type in ["stylesheet", "image"]:
97
- await route.abort()
98
- else:
99
- await route.continue_()
100
-
101
- await page.route("**/*", _block_resources)
102
-
103
- url = f"https://search.brave.com/search?q={quote_plus(q)}"
104
- await page.goto(url)
105
-
106
- results_cards = await page.locator('.snippet').all()
107
-
108
- if len(results_cards) == 0:
109
- logging.warning(f"No results for query: {q}")
110
- logging.warning(await page.content())
111
-
112
- results = []
113
-
114
- for result in results_cards:
115
- title = await result.locator('.title').all_inner_texts()
116
- description = await result.locator('.snippet-description').all_inner_texts()
117
- url = await result.locator('a').nth(0).get_attribute('href')
118
-
119
- if url.startswith('/'):
120
- continue
121
-
122
- results.append({"title": title[0] if len(title) > 0 else "", "body": description[0] if len(
123
- description) > 0 else "", "href": url})
124
-
125
- return results[:n_results]
126
-
127
-
128
  @app.post("/search_scholar")
129
  async def query_google_scholar(params: APISearchParams):
130
  """Queries google scholar for the specified query"""
@@ -133,7 +51,7 @@ async def query_google_scholar(params: APISearchParams):
133
 
134
  @app.get('/')
135
  async def status():
136
- return {"status": "running"}
137
 
138
 
139
  @app.post("/search_patents")
@@ -146,24 +64,77 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
146
  res = await query_google_patents(pw_browser, q, params.n_results)
147
  results.extend(res)
148
  except Exception as e:
 
149
  logging.error(
150
  f"Failed to query Google Patents with query `{q}`: {e}")
151
  return APIPatentResults(results=results, error=None)
152
 
153
 
154
  @app.post("/search_brave")
155
- async def search_brave(params: APISearchParams) -> APIBraveResults:
156
  """Searches brave search for the specified queries and returns the found documents."""
157
  results = []
 
158
  for q in params.queries:
159
  logging.info(f"Searching Brave search with query `{q}`")
160
  try:
161
  res = await query_brave_search(pw_browser, q, params.n_results)
162
  results.extend(res)
163
  except Exception as e:
 
 
164
  logging.error(
165
  f"Failed to query Brave search with query `{q}`: {e}")
166
 
167
- return APIBraveResults(results=results, error=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  from contextlib import asynccontextmanager
2
+ import json
3
  from typing import Optional
4
+ from duckduckgo_search import DDGS
5
+ from duckduckgo_search.exceptions import RatelimitException
6
+ import expiringdict
7
  from fastapi import FastAPI
8
  from pydantic import BaseModel, Field
9
  from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 
12
  import re
13
  import uvicorn
14
 
15
+ from backends import APISearchResults, APIPatentResults, query_brave_search, query_ddg_search, query_google_patents
16
+
17
  logging.basicConfig(level=logging.INFO)
18
 
19
  # playwright global context
20
  playwright = None
21
+ pw_browser: Optional[Browser] = None
22
 
23
 
24
  @asynccontextmanager
 
33
  await playwright.stop()
34
 
35
  app = FastAPI(lifespan=api_lifespan)
36
+ backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
37
 
38
 
39
  class APISearchParams(BaseModel):
 
43
  10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  @app.post("/search_scholar")
47
  async def query_google_scholar(params: APISearchParams):
48
  """Queries google scholar for the specified query"""
 
51
 
52
@app.get('/')
async def status():
    """Report that the API is running, along with the recorded backend statuses."""
    payload = {"status": "running"}
    # backend_status is an expiring dict; its entries are exposed with their timestamps
    payload["backend_status"] = backend_status.items_with_timestamp()
    return payload
55
 
56
 
57
  @app.post("/search_patents")
 
64
  res = await query_google_patents(pw_browser, q, params.n_results)
65
  results.extend(res)
66
  except Exception as e:
67
+ backend_status["gpatents"] = "rate-limited"
68
  logging.error(
69
  f"Failed to query Google Patents with query `{q}`: {e}")
70
  return APIPatentResults(results=results, error=None)
71
 
72
 
73
@app.post("/search_brave")
async def search_brave(params: APISearchParams) -> APISearchResults:
    """Run every query in ``params.queries`` against Brave Search.

    Results of all queries are concatenated. A failing query is logged, flags the
    "brave" backend in ``backend_status`` and does not stop the remaining queries.
    The message of the last failure is returned as ``error`` only when no result
    at all could be collected.
    """
    collected = []
    failure: Optional[Exception] = None

    for q in params.queries:
        logging.info(f"Searching Brave search with query `{q}`")
        try:
            collected.extend(await query_brave_search(pw_browser, q, params.n_results))
        except Exception as e:
            failure = e
            backend_status["brave"] = "rate-limited"
            logging.error(
                f"Failed to query Brave search with query `{q}`: {e}")

    # Partial success wins: an error is only surfaced when nothing was found
    if collected or not failure:
        return APISearchResults(results=collected, error=None)
    return APISearchResults(results=collected, error=str(failure))
90
+
91
+
92
@app.post("/search_duck")
async def search_duck(params: APISearchParams) -> APISearchResults:
    """Run every query in ``params.queries`` against DuckDuckGo.

    Results of all queries are concatenated. A failing query is logged, flags the
    "duckduckgo" backend in ``backend_status`` and does not stop the remaining
    queries. The message of the last failure is returned as ``error`` only when no
    result at all could be collected.
    """
    hits = []
    failure: Optional[Exception] = None

    for q in params.queries:
        logging.info(f"Querying DDG with query: `{q}`")
        try:
            hits.extend(await query_ddg_search(q, params.n_results))
        except Exception as e:
            failure = e
            backend_status["duckduckgo"] = "rate-limited"
            logging.error(f"Failed to query DDG with query `{q}`: {e}")

    # Partial success wins: an error is only surfaced when nothing was found
    if hits or not failure:
        return APISearchResults(results=hits, error=None)
    return APISearchResults(results=hits, error=str(failure))
109
+
110
+
111
@app.post("/search")
async def search(params: APISearchParams) -> APISearchResults:
    """Search the specified queries, falling back from one backend to the next.

    For each query DuckDuckGo is tried first; Brave Search is only used when the
    DuckDuckGo call raised. Results of all queries are concatenated.

    Every backend failure is recorded in ``backend_status`` (same keys as the
    dedicated /search_duck and /search_brave endpoints) so that the status route
    reflects failures seen through this endpoint too.

    Returns:
        ``APISearchResults`` with the collected results. ``error`` is set only when
        nothing was collected AND at least one query failed on every backend; an
        empty query list or backends that legitimately found nothing are not
        reported as a rate limit.
    """
    results = []
    # True once some query could not be served by any backend
    query_failed_everywhere = False

    for q in params.queries:
        try:
            logging.info(f"Querying DDG with query: `{q}`")
            res = await query_ddg_search(q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            backend_status["duckduckgo"] = "rate-limited"
            logging.error(f"Failed to query DDG with query `{q}`: {e}")
            logging.info("Trying with next browser backend.")

        try:
            logging.info(f"Querying Brave Search with query: `{q}`")
            res = await query_brave_search(pw_browser, q, params.n_results)
            results.extend(res)
            continue
        except Exception as e:
            backend_status["brave"] = "rate-limited"
            logging.error(
                f"Failed to query Brave Search with query `{q}`: {e}")

        # Reached only when both backends raised for this query
        query_failed_everywhere = True

    if not results and query_failed_everywhere:
        return APISearchResults(results=[], error="All backends are rate-limited.")

    return APISearchResults(results=results, error=None)
139
 
140
if __name__ == "__main__":
    # Start the server only when this file is executed as a script, so that
    # importing the module (tests, tooling, an external ASGI runner) does not block.
    uvicorn.run(app, host="0.0.0.0", port=7860)
backends.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+ from typing import Optional
3
+ from duckduckgo_search import DDGS
4
+ from pydantic import BaseModel
5
+ from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
6
+ from urllib.parse import quote_plus
7
+ import logging
8
+ import re
9
+
10
+
11
class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint"""
    # Failure description, or None when there is nothing to report.
    # Explicit default: under pydantic v2 an Optional field without a default is required.
    error: Optional[str] = None
    # Patents found, one dict per patent.
    results: Optional[list[dict]] = None
15
+
16
+
17
class APISearchResults(BaseModel):
    """Response of the web search endpoints."""
    # Failure description, or None when there is nothing to report.
    # Explicit default: under pydantic v2 an Optional field without a default is required.
    error: Optional[str] = None
    # Documents found, one dict per document.
    results: Optional[list[dict]] = None
20
+
21
+
22
class BraveSearchBlockedException(Exception):
    """Marker exception raised when the headless browser is flagged as suspicious."""
25
+
26
+
27
@asynccontextmanager
async def playwright_open_page(browser: Browser):
    """Context manager yielding a page opened in a fresh browser context.

    Args:
        browser: Browser in which the throwaway context is created.

    Yields:
        A new page belonging to that context.

    The context is closed on exit in every case, including when opening the page
    itself fails or the body raises.
    """
    context: BrowserContext = await browser.new_context()
    try:
        # Opened inside the try so a failure here still releases the context
        page: Page = await context.new_page()
        yield page
    finally:
        # Closing the context also closes every page that belongs to it, so no
        # separate page.close() is needed (and none can prevent this cleanup).
        await context.close()
37
+
38
+
39
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Queries google patents for the specified query and number of results.

    Args:
        browser: Browser used to open a throwaway page.
        q: Search query.
        n_results: Maximum number of patents to return.

    Returns:
        A list of ``{"href": ..., "id": ...}`` dicts, one per result card in which
        a patent id was found, in page order, truncated to ``n_results``.

    Raises:
        TimeoutError: If not a single result card showed up within 30 seconds.
    """
    # regex to locate a patent id
    patent_id_regex = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    async with playwright_open_page(browser) as page:

        async def _block_resources(route, request):
            # Stylesheets and images are never read here, so they are not fetched
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        encoded_query = quote_plus(q)
        url = f"https://patents.google.com/?q=({encoded_query})&oq={encoded_query}&num={n_results}"
        await page.goto(url)

        wait_error = None
        try:
            await page.wait_for_function(
                f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
                timeout=30_000
            )
        except TimeoutError as e:
            # A query with fewer than n_results matches can never satisfy the wait;
            # decide below whether anything usable was rendered.
            wait_error = e

        items = await page.locator("search-result-item").all()

        if wait_error is not None:
            if not items:
                # Nothing rendered at all: keep reporting the timeout to the caller
                raise wait_error
            logging.warning(
                f"Google Patents rendered only {len(items)} of the {n_results} requested results for query `{q}`")

        patents = []
        for item in items:
            all_text = " ".join(await item.locator("span").all_inner_texts())
            match = patent_id_regex.search(all_text)
            if match:
                patent_id = match.group(0)
                patents.append(
                    {"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id})

        return patents[:n_results]
75
+
76
+
77
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Query Brave Search and return up to ``n_results`` external results.

    Each result is a dict with "title", "body" and "href" keys. Cards without a
    link, or whose link starts with '/', are skipped. A selector timeout while
    reading the cards is logged and the results gathered so far are returned.

    Raises:
        BraveSearchBlockedException: If no result card is present and the page
            content contains the word "suspicious".
    """

    async def _skip_static_assets(route, request):
        # Let everything through except stylesheets and images
        if request.resource_type not in ["stylesheet", "image"]:
            await route.continue_()
            return
        await route.abort()

    async with playwright_open_page(browser) as page:
        await page.route("**/*", _skip_static_assets)
        await page.goto(f"https://search.brave.com/search?q={quote_plus(q)}")

        cards = await page.locator('.snippet').all()

        if not cards:
            logging.warning(f"No results for query: {q}")
            html = await page.content()

            if "suspicious" in html:
                logging.warning("Brave search flagged browser as suspicious.")
                raise BraveSearchBlockedException()

        found = []

        try:
            for card in cards:
                titles = await card.locator('.title').all_inner_texts()
                snippets = await card.locator('.snippet-description').all_inner_texts()
                href = await card.locator('a').nth(0).get_attribute('href')

                # Keep only cards that carry an external link
                if href is not None and not href.startswith('/'):
                    found.append({
                        "title": titles[0] if titles else "",
                        "body": snippets[0] if snippets else "",
                        "href": href
                    })

                    if len(found) >= n_results:
                        break

        except TimeoutError as e:
            logging.warning(
                f"Timeout on selector while parsing Brave Search SERP: {e}")

        return found
129
+
130
+
131
async def query_ddg_search(q: str, n_results: int = 10):
    """Queries duckduckgo search for the specified query.

    Args:
        q: Search query.
        n_results: Passed to the search client as ``max_results``.

    Returns:
        A list of dicts with "title", "body" and "href" keys, one per result.
    """
    import asyncio  # local import so the block does not rely on a file-level one

    def _run_query():
        # DDGS().text is a plain synchronous call that performs the search
        return [
            {"title": result["title"], "body": result["body"], "href": result["href"]}
            for result in DDGS().text(q, max_results=n_results)
        ]

    # Run the blocking call in a worker thread so other requests handled by the
    # event loop are not stalled while the search is in flight.
    return await asyncio.to_thread(_run_query)
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  fastapi
2
  uvicorn
3
  pydantic
4
- playwright
 
 
 
1
  fastapi
2
  uvicorn
3
  pydantic
4
+ playwright
5
+ duckduckgo_search
6
+ expiringdict