github-actions[bot] committed on
Commit
e25b548
·
1 Parent(s): fe63966

Auto-sync from demo at Mon Nov 10 12:33:00 UTC 2025

Browse files
graphgen/bases/__init__.py CHANGED
@@ -4,6 +4,7 @@ from .base_kg_builder import BaseKGBuilder
4
  from .base_llm_wrapper import BaseLLMWrapper
5
  from .base_partitioner import BasePartitioner
6
  from .base_reader import BaseReader
 
7
  from .base_splitter import BaseSplitter
8
  from .base_storage import (
9
  BaseGraphStorage,
 
4
  from .base_llm_wrapper import BaseLLMWrapper
5
  from .base_partitioner import BasePartitioner
6
  from .base_reader import BaseReader
7
+ from .base_searcher import BaseSearcher
8
  from .base_splitter import BaseSplitter
9
  from .base_storage import (
10
  BaseGraphStorage,
graphgen/bases/base_searcher.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, List
3
+
4
+
5
+ class BaseSearcher(ABC):
6
+ """
7
+ Abstract base class for searching and retrieving data.
8
+ """
9
+
10
+ @abstractmethod
11
+ async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]:
12
+ """
13
+ Search for data based on the given query.
14
+
15
+ :param query: The searcher query.
16
+ :param kwargs: Additional keyword arguments for the searcher.
17
+ :return: List of dictionaries containing the searcher results.
18
+ """
graphgen/configs/search_config.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pipeline:
2
+ - name: read
3
+ params:
4
+ input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5
+
6
+ - name: search
7
+ params:
8
+ data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
graphgen/graphgen.py CHANGED
@@ -58,7 +58,6 @@ class GraphGen:
58
  self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
59
  self.working_dir, namespace="_meta"
60
  )
61
-
62
  self.full_docs_storage: JsonKVStorage = JsonKVStorage(
63
  self.working_dir, namespace="full_docs"
64
  )
@@ -69,9 +68,8 @@ class GraphGen:
69
  self.working_dir, namespace="graph"
70
  )
71
  self.search_storage: JsonKVStorage = JsonKVStorage(
72
- self.working_dir, namespace="search"
73
  )
74
-
75
  self.rephrase_storage: JsonKVStorage = JsonKVStorage(
76
  self.working_dir, namespace="rephrase"
77
  )
@@ -181,41 +179,33 @@ class GraphGen:
181
 
182
  return _add_entities_and_relations
183
 
184
- @op("search", deps=["chunk"])
185
  @async_to_sync_method
186
  async def search(self, search_config: Dict):
187
- logger.info(
188
- "Search is %s", "enabled" if search_config["enabled"] else "disabled"
 
 
 
 
 
 
 
 
 
 
 
189
  )
190
- if search_config["enabled"]:
191
- logger.info("[Search] %s ...", ", ".join(search_config["search_types"]))
192
- all_nodes = await self.graph_storage.get_all_nodes()
193
- all_nodes_names = [node[0] for node in all_nodes]
194
- new_search_entities = await self.full_docs_storage.filter_keys(
195
- all_nodes_names
196
- )
197
- logger.info(
198
- "[Search] Found %d entities to search", len(new_search_entities)
199
- )
200
- _add_search_data = await search_all(
201
- search_types=search_config["search_types"],
202
- search_entities=new_search_entities,
203
- )
204
- if _add_search_data:
205
- await self.search_storage.upsert(_add_search_data)
206
- logger.info("[Search] %d entities searched", len(_add_search_data))
207
-
208
- # Format search results for inserting
209
- search_results = []
210
- for _, search_data in _add_search_data.items():
211
- search_results.extend(
212
- [
213
- {"content": search_data[key]}
214
- for key in list(search_data.keys())
215
- ]
216
- )
217
- # TODO: fix insert after search
218
- # await self.insert()
219
 
220
  @op("quiz_and_judge", deps=["build_kg"])
221
  @async_to_sync_method
 
58
  self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
59
  self.working_dir, namespace="_meta"
60
  )
 
61
  self.full_docs_storage: JsonKVStorage = JsonKVStorage(
62
  self.working_dir, namespace="full_docs"
63
  )
 
68
  self.working_dir, namespace="graph"
69
  )
70
  self.search_storage: JsonKVStorage = JsonKVStorage(
71
+ self.working_dir, namespace="searcher"
72
  )
 
73
  self.rephrase_storage: JsonKVStorage = JsonKVStorage(
74
  self.working_dir, namespace="rephrase"
75
  )
 
179
 
180
  return _add_entities_and_relations
181
 
182
+ @op("search", deps=["read"])
183
  @async_to_sync_method
184
  async def search(self, search_config: Dict):
185
+ logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
186
+
187
+ seeds = await self.meta_storage.get_new_data(self.full_docs_storage)
188
+ if len(seeds) == 0:
189
+ logger.warning("All documents are already been searched")
190
+ return
191
+ search_results = await search_all(
192
+ seed_data=seeds,
193
+ **search_config,
194
+ )
195
+
196
+ _add_search_keys = await self.search_storage.filter_keys(
197
+ list(search_results.keys())
198
  )
199
+ search_results = {
200
+ k: v for k, v in search_results.items() if k in _add_search_keys
201
+ }
202
+ if len(search_results) == 0:
203
+ logger.warning("All search results are already in the storage")
204
+ return
205
+ await self.search_storage.upsert(search_results)
206
+ await self.search_storage.index_done_callback()
207
+ await self.meta_storage.mark_done(self.full_docs_storage)
208
+ await self.meta_storage.index_done_callback()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  @op("quiz_and_judge", deps=["build_kg"])
211
  @async_to_sync_method
graphgen/models/__init__.py CHANGED
@@ -25,10 +25,10 @@ from .reader import (
25
  RDFReader,
26
  TXTReader,
27
  )
28
- from .search.db.uniprot_search import UniProtSearch
29
- from .search.kg.wiki_search import WikiSearch
30
- from .search.web.bing_search import BingSearch
31
- from .search.web.google_search import GoogleSearch
32
  from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
33
  from .storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage, NetworkXStorage
34
  from .tokenizer import Tokenizer
 
25
  RDFReader,
26
  TXTReader,
27
  )
28
+ from .searcher.db.uniprot_searcher import UniProtSearch
29
+ from .searcher.kg.wiki_search import WikiSearch
30
+ from .searcher.web.bing_search import BingSearch
31
+ from .searcher.web.google_search import GoogleSearch
32
  from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
33
  from .storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage, NetworkXStorage
34
  from .tokenizer import Tokenizer
graphgen/models/{search → searcher}/__init__.py RENAMED
File without changes
graphgen/models/{search → searcher}/db/__init__.py RENAMED
File without changes
graphgen/models/{search/db/uniprot_search.py → searcher/db/uniprot_searcher.py} RENAMED
@@ -1,18 +1,27 @@
 
1
  from io import StringIO
2
  from typing import Dict, Optional
3
 
4
  from Bio import ExPASy, SeqIO, SwissProt, UniProt
5
  from Bio.Blast import NCBIWWW, NCBIXML
6
-
 
 
 
 
 
 
 
 
7
  from graphgen.utils import logger
8
 
9
 
10
- class UniProtSearch:
11
  """
12
- UniProt Search client to search with UniProt.
13
  1) Get the protein by accession number.
14
- 2) Search with keywords or protein names (fuzzy search).
15
- 3) Search with FASTA sequence (BLAST search).
16
  """
17
 
18
  def get_by_accession(self, accession: str) -> Optional[dict]:
@@ -21,6 +30,8 @@ class UniProtSearch:
21
  record = SwissProt.read(handle)
22
  handle.close()
23
  return self._swissprot_to_dict(record)
 
 
24
  except Exception as exc: # pylint: disable=broad-except
25
  logger.error("Accession %s not found: %s", accession, exc)
26
  return None
@@ -51,7 +62,7 @@ class UniProtSearch:
51
  def get_best_hit(self, keyword: str) -> Optional[Dict]:
52
  """
53
  Search UniProt with a keyword and return the best hit.
54
- :param keyword: The search keyword.
55
  :return: A dictionary containing the best hit information or None if not found.
56
  """
57
  if not keyword.strip():
@@ -64,15 +75,17 @@ class UniProtSearch:
64
  return None
65
  return self.get_by_accession(hit["primaryAccession"])
66
 
 
 
67
  except Exception as e: # pylint: disable=broad-except
68
  logger.error("Keyword %s not found: %s", keyword, e)
69
- return None
70
 
71
  def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
72
  """
73
  Search UniProt with a FASTA sequence and return the best hit.
74
  :param fasta_sequence: The FASTA sequence.
75
- :param threshold: E-value threshold for BLAST search.
76
  :return: A dictionary containing the best hit information or None if not found.
77
  """
78
  try:
@@ -90,6 +103,7 @@ class UniProtSearch:
90
 
91
  # UniProtKB/Swiss-Prot BLAST API
92
  try:
 
93
  result_handle = NCBIWWW.qblast(
94
  program="blastp",
95
  database="swissprot",
@@ -98,8 +112,10 @@ class UniProtSearch:
98
  expect=threshold,
99
  )
100
  blast_record = NCBIXML.read(result_handle)
 
 
101
  except Exception as e: # pylint: disable=broad-except
102
- logger.error("BLAST search failed: %s", e)
103
  return None
104
 
105
  if not blast_record.alignments:
@@ -116,3 +132,44 @@ class UniProtSearch:
116
  # like sp|P01308.1|INS_HUMAN
117
  accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
118
  return self.get_by_accession(accession)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
  from io import StringIO
3
  from typing import Dict, Optional
4
 
5
  from Bio import ExPASy, SeqIO, SwissProt, UniProt
6
  from Bio.Blast import NCBIWWW, NCBIXML
7
+ from requests.exceptions import RequestException
8
+ from tenacity import (
9
+ retry,
10
+ retry_if_exception_type,
11
+ stop_after_attempt,
12
+ wait_exponential,
13
+ )
14
+
15
+ from graphgen.bases import BaseSearcher
16
  from graphgen.utils import logger
17
 
18
 
19
+ class UniProtSearch(BaseSearcher):
20
  """
21
+ UniProt Search client to searcher with UniProt.
22
  1) Get the protein by accession number.
23
+ 2) Search with keywords or protein names (fuzzy searcher).
24
+ 3) Search with FASTA sequence (BLAST searcher).
25
  """
26
 
27
  def get_by_accession(self, accession: str) -> Optional[dict]:
 
30
  record = SwissProt.read(handle)
31
  handle.close()
32
  return self._swissprot_to_dict(record)
33
+ except RequestException: # network-related errors
34
+ raise
35
  except Exception as exc: # pylint: disable=broad-except
36
  logger.error("Accession %s not found: %s", accession, exc)
37
  return None
 
62
  def get_best_hit(self, keyword: str) -> Optional[Dict]:
63
  """
64
  Search UniProt with a keyword and return the best hit.
65
+ :param keyword: The searcher keyword.
66
  :return: A dictionary containing the best hit information or None if not found.
67
  """
68
  if not keyword.strip():
 
75
  return None
76
  return self.get_by_accession(hit["primaryAccession"])
77
 
78
+ except RequestException:
79
+ raise
80
  except Exception as e: # pylint: disable=broad-except
81
  logger.error("Keyword %s not found: %s", keyword, e)
82
+ return None
83
 
84
  def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
85
  """
86
  Search UniProt with a FASTA sequence and return the best hit.
87
  :param fasta_sequence: The FASTA sequence.
88
+ :param threshold: E-value threshold for BLAST searcher.
89
  :return: A dictionary containing the best hit information or None if not found.
90
  """
91
  try:
 
103
 
104
  # UniProtKB/Swiss-Prot BLAST API
105
  try:
106
+ logger.debug("Performing BLAST searcher for the given sequence: %s", seq)
107
  result_handle = NCBIWWW.qblast(
108
  program="blastp",
109
  database="swissprot",
 
112
  expect=threshold,
113
  )
114
  blast_record = NCBIXML.read(result_handle)
115
+ except RequestException:
116
+ raise
117
  except Exception as e: # pylint: disable=broad-except
118
+ logger.error("BLAST searcher failed: %s", e)
119
  return None
120
 
121
  if not blast_record.alignments:
 
132
  # like sp|P01308.1|INS_HUMAN
133
  accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
134
  return self.get_by_accession(accession)
135
+
136
+ @retry(
137
+ stop=stop_after_attempt(5),
138
+ wait=wait_exponential(multiplier=1, min=4, max=10),
139
+ retry=retry_if_exception_type(RequestException),
140
+ reraise=True,
141
+ )
142
+ async def search(
143
+ self, query: str, threshold: float = 0.7, **kwargs
144
+ ) -> Optional[Dict]:
145
+ """
146
+ Search UniProt with either an accession number, keyword, or FASTA sequence.
147
+ :param query: The searcher query (accession number, keyword, or FASTA sequence).
148
+ :param threshold: E-value threshold for BLAST searcher.
149
+ :return: A dictionary containing the best hit information or None if not found.
150
+ """
151
+
152
+ # auto detect query type
153
+ if not query or not isinstance(query, str):
154
+ logger.error("Empty or non-string input.")
155
+ return None
156
+ query = query.strip()
157
+
158
+ logger.debug("UniProt searcher query: %s", query)
159
+ # check if fasta sequence
160
+ if query.startswith(">") or re.fullmatch(
161
+ r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
162
+ ):
163
+ result = self.get_by_fasta(query, threshold)
164
+
165
+ # check if accession number
166
+ elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I):
167
+ result = self.get_by_accession(query)
168
+
169
+ else:
170
+ # otherwise treat as keyword
171
+ result = self.get_best_hit(query)
172
+
173
+ if result:
174
+ result["_search_query"] = query
175
+ return result
graphgen/models/{search → searcher}/kg/__init__.py RENAMED
File without changes
graphgen/models/{search → searcher}/kg/wiki_search.py RENAMED
File without changes
graphgen/models/{search → searcher}/web/__init__.py RENAMED
File without changes
graphgen/models/{search → searcher}/web/bing_search.py RENAMED
@@ -9,7 +9,7 @@ BING_MKT = "en-US"
9
 
10
  class BingSearch:
11
  """
12
- Bing Search client to search with Bing.
13
  """
14
 
15
  def __init__(self, subscription_key: str):
@@ -18,9 +18,9 @@ class BingSearch:
18
  def search(self, query: str, num_results: int = 1):
19
  """
20
  Search with Bing and return the contexts.
21
- :param query: The search query.
22
  :param num_results: The number of results to return.
23
- :return: A list of search results.
24
  """
25
  params = {"q": query, "mkt": BING_MKT, "count": num_results}
26
  response = requests.get(
 
9
 
10
  class BingSearch:
11
  """
12
+ Bing Search client to searcher with Bing.
13
  """
14
 
15
  def __init__(self, subscription_key: str):
 
18
  def search(self, query: str, num_results: int = 1):
19
  """
20
  Search with Bing and return the contexts.
21
+ :param query: The searcher query.
22
  :param num_results: The number of results to return.
23
+ :return: A list of searcher results.
24
  """
25
  params = {"q": query, "mkt": BING_MKT, "count": num_results}
26
  response = requests.get(
graphgen/models/{search → searcher}/web/google_search.py RENAMED
@@ -9,9 +9,9 @@ GOOGLE_SEARCH_ENDPOINT = "https://customsearch.googleapis.com/customsearch/v1"
9
  class GoogleSearch:
10
  def __init__(self, subscription_key: str, cx: str):
11
  """
12
- Initialize the Google Search client with the subscription key and custom search engine ID.
13
  :param subscription_key: Your Google API subscription key.
14
- :param cx: Your custom search engine ID.
15
  """
16
  self.subscription_key = subscription_key
17
  self.cx = cx
@@ -19,9 +19,9 @@ class GoogleSearch:
19
  def search(self, query: str, num_results: int = 1):
20
  """
21
  Search with Google and return the contexts.
22
- :param query: The search query.
23
  :param num_results: The number of results to return.
24
- :return: A list of search results.
25
  """
26
  params = {
27
  "key": self.subscription_key,
 
9
  class GoogleSearch:
10
  def __init__(self, subscription_key: str, cx: str):
11
  """
12
+ Initialize the Google Search client with the subscription key and custom searcher engine ID.
13
  :param subscription_key: Your Google API subscription key.
14
+ :param cx: Your custom searcher engine ID.
15
  """
16
  self.subscription_key = subscription_key
17
  self.cx = cx
 
19
  def search(self, query: str, num_results: int = 1):
20
  """
21
  Search with Google and return the contexts.
22
+ :param query: The searcher query.
23
  :param num_results: The number of results to return.
24
+ :return: A list of searcher results.
25
  """
26
  params = {
27
  "key": self.subscription_key,
graphgen/operators/search/kg/__init__.py DELETED
File without changes
graphgen/operators/search/kg/search_wikipedia.py DELETED
@@ -1,58 +0,0 @@
1
- from tqdm.asyncio import tqdm_asyncio as tqdm_async
2
-
3
- from graphgen.models import WikiSearch
4
- from graphgen.utils import logger
5
-
6
-
7
- async def _process_single_entity(
8
- entity_name: str,
9
- wiki_search_client: WikiSearch,
10
- ) -> str | None:
11
- """
12
- Process single entity by searching Wikipedia
13
- :param entity_name
14
- :param wiki_search_client
15
- :return: summary of the entity or None if not found
16
- """
17
- search_results = await wiki_search_client.search(entity_name)
18
- if not search_results:
19
- return None
20
-
21
- summary = None
22
- try:
23
- summary = await wiki_search_client.summary(search_results[-1])
24
- logger.info(
25
- "Entity %s search result: %s summary: %s",
26
- entity_name,
27
- str(search_results),
28
- summary,
29
- )
30
- except Exception as e: # pylint: disable=broad-except
31
- logger.error("Error processing entity %s: %s", entity_name, str(e))
32
-
33
- return summary
34
-
35
-
36
- async def search_wikipedia(
37
- wiki_search_client: WikiSearch,
38
- entities: set[str],
39
- ) -> dict:
40
- """
41
- Search wikipedia for entities
42
-
43
- :param wiki_search_client: wiki search client
44
- :param entities: list of entities to search
45
- :return: nodes with search results
46
- """
47
- wiki_data = {}
48
-
49
- async for entity in tqdm_async(
50
- entities, desc="Searching Wikipedia", total=len(entities)
51
- ):
52
- try:
53
- summary = await _process_single_entity(entity, wiki_search_client)
54
- if summary:
55
- wiki_data[entity] = summary
56
- except Exception as e: # pylint: disable=broad-except
57
- logger.error("Error processing entity %s: %s", entity, str(e))
58
- return wiki_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
graphgen/operators/search/search_all.py CHANGED
@@ -1,82 +1,49 @@
1
  """
2
  To use Google Web Search API,
3
  follow the instructions [here](https://developers.google.com/custom-search/v1/overview)
4
- to get your Google search api key.
5
 
6
  To use Bing Web Search API,
7
  follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api)
8
  and obtain your Bing subscription key.
9
  """
10
 
11
- import os
12
 
13
- from graphgen.utils import logger
14
 
15
 
16
  async def search_all(
17
- search_types: dict, search_entities: set[str]
18
- ) -> dict[str, dict[str, str]]:
 
19
  """
20
- :param search_types
21
- :param search_entities: list of entities to search
22
- :return: nodes with search results
 
23
  """
24
 
25
  results = {}
26
 
27
- for search_type in search_types:
28
- if search_type == "wikipedia":
29
- from graphgen.models import WikiSearch
30
- from graphgen.operators.search.kg.search_wikipedia import search_wikipedia
31
 
32
- wiki_search_client = WikiSearch()
33
 
34
- wiki_results = await search_wikipedia(wiki_search_client, search_entities)
35
- for entity_name, description in wiki_results.items():
36
- if description:
37
- results[entity_name] = {"wikipedia": description}
38
- elif search_type == "google":
39
- from graphgen.models import GoogleSearch
40
- from graphgen.operators.search.web.search_google import search_google
41
-
42
- google_search_client = GoogleSearch(
43
- subscription_key=os.environ["GOOGLE_SEARCH_API_KEY"],
44
- cx=os.environ["GOOGLE_SEARCH_CX"],
45
- )
46
-
47
- google_results = await search_google(google_search_client, search_entities)
48
- for entity_name, description in google_results.items():
49
- if description:
50
- results[entity_name] = results.get(entity_name, {})
51
- results[entity_name]["google"] = description
52
- elif search_type == "bing":
53
- from graphgen.models import BingSearch
54
- from graphgen.operators.search.web.search_bing import search_bing
55
-
56
- bing_search_client = BingSearch(
57
- subscription_key=os.environ["BING_SEARCH_API_KEY"]
58
  )
59
-
60
- bing_results = await search_bing(bing_search_client, search_entities)
61
- for entity_name, description in bing_results.items():
62
- if description:
63
- results[entity_name] = results.get(entity_name, {})
64
- results[entity_name]["bing"] = description
65
- elif search_type == "uniprot":
66
- # from graphgen.models import UniProtSearch
67
- # from graphgen.operators.search.db.search_uniprot import search_uniprot
68
- #
69
- # uniprot_search_client = UniProtSearch()
70
- #
71
- # uniprot_results = await search_uniprot(
72
- # uniprot_search_client, search_entities
73
- # )
74
- raise NotImplementedError(
75
- "Processing of UniProt search results is not implemented yet."
76
- )
77
-
78
  else:
79
- logger.error("Search type %s is not supported yet.", search_type)
80
  continue
81
 
 
 
82
  return results
 
1
  """
2
  To use Google Web Search API,
3
  follow the instructions [here](https://developers.google.com/custom-search/v1/overview)
4
+ to get your Google searcher api key.
5
 
6
  To use Bing Web Search API,
7
  follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api)
8
  and obtain your Bing subscription key.
9
  """
10
 
 
11
 
12
+ from graphgen.utils import logger, run_concurrent
13
 
14
 
15
  async def search_all(
16
+ seed_data: dict,
17
+ data_sources: list[str],
18
+ ) -> dict:
19
  """
20
+ Perform searches across multiple search types and aggregate the results.
21
+ :param seed_data: A dictionary containing seed data with entity names.
22
+ :param data_sources: A list of search types to perform (e.g., "wikipedia", "google", "bing", "uniprot").
23
+ :return: A dictionary with
24
  """
25
 
26
  results = {}
27
 
28
+ for data_source in data_sources:
29
+ if data_source == "uniprot":
30
+ from graphgen.models import UniProtSearch
 
31
 
32
+ uniprot_search_client = UniProtSearch()
33
 
34
+ data = list(seed_data.values())
35
+ data = [d["content"] for d in data if "content" in d]
36
+ data = list(set(data)) # Remove duplicates
37
+ uniprot_results = await run_concurrent(
38
+ uniprot_search_client.search,
39
+ data,
40
+ desc="Searching UniProt database",
41
+ unit="keyword",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  else:
44
+ logger.error("Data source %s not supported.", data_source)
45
  continue
46
 
47
+ results[data_source] = uniprot_results
48
+
49
  return results
graphgen/operators/search/web/__init__.py DELETED
File without changes
graphgen/operators/search/web/search_bing.py DELETED
@@ -1,53 +0,0 @@
1
- import trafilatura
2
- from tqdm.asyncio import tqdm_asyncio as tqdm_async
3
-
4
- from graphgen.models import BingSearch
5
- from graphgen.utils import logger
6
-
7
-
8
- async def _process_single_entity(
9
- entity_name: str, bing_search_client: BingSearch
10
- ) -> str | None:
11
- """
12
- Process single entity by searching Bing.
13
- :param entity_name: The name of the entity to search.
14
- :param bing_search_client: The Bing search client.
15
- :return: Summary of the entity or None if not found.
16
- """
17
- search_results = bing_search_client.search(entity_name)
18
- if not search_results:
19
- return None
20
-
21
- # Get more details from the first search result
22
- first_result = search_results[0]
23
- content = trafilatura.fetch_url(first_result["url"])
24
- summary = trafilatura.extract(content, include_comments=False, include_links=False)
25
- summary = summary.strip()
26
- logger.info(
27
- "Entity %s search result: %s",
28
- entity_name,
29
- summary,
30
- )
31
- return summary
32
-
33
-
34
- async def search_bing(
35
- bing_search_client: BingSearch,
36
- entities: set[str],
37
- ) -> dict[str, str]:
38
- """
39
- Search with Bing and return the contexts.
40
- :return:
41
- """
42
- bing_data = {}
43
-
44
- async for entity in tqdm_async(
45
- entities, desc="Searching Bing", total=len(entities)
46
- ):
47
- try:
48
- summary = await _process_single_entity(entity, bing_search_client)
49
- if summary:
50
- bing_data[entity] = summary
51
- except Exception as e: # pylint: disable=broad-except
52
- logger.error("Error processing entity %s: %s", entity, str(e))
53
- return bing_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
graphgen/operators/search/web/search_google.py DELETED
@@ -1,49 +0,0 @@
1
- import trafilatura
2
- from tqdm.asyncio import tqdm_asyncio as tqdm_async
3
-
4
- from graphgen.models import GoogleSearch
5
- from graphgen.utils import logger
6
-
7
-
8
- async def _process_single_entity(
9
- entity_name: str, google_search_client: GoogleSearch
10
- ) -> str | None:
11
- search_results = google_search_client.search(entity_name)
12
- if not search_results:
13
- return None
14
-
15
- # Get more details from the first search result
16
- first_result = search_results[0]
17
- content = trafilatura.fetch_url(first_result["link"])
18
- summary = trafilatura.extract(content, include_comments=False, include_links=False)
19
- summary = summary.strip()
20
- logger.info(
21
- "Entity %s search result: %s",
22
- entity_name,
23
- summary,
24
- )
25
- return summary
26
-
27
-
28
- async def search_google(
29
- google_search_client: GoogleSearch,
30
- entities: set[str],
31
- ) -> dict:
32
- """
33
- Search with Google and return the contexts.
34
- :param google_search_client: Google search client
35
- :param entities: list of entities to search
36
- :return:
37
- """
38
- google_data = {}
39
-
40
- async for entity in tqdm_async(
41
- entities, desc="Searching Google", total=len(entities)
42
- ):
43
- try:
44
- summary = await _process_single_entity(entity, google_search_client)
45
- if summary:
46
- google_data[entity] = summary
47
- except Exception as e: # pylint: disable=broad-except
48
- logger.error("Error processing entity %s: %s", entity, str(e))
49
- return google_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
graphgen/templates/search_judgement.py CHANGED
@@ -1,16 +1,16 @@
1
  # pylint: disable=C0301
2
 
3
  TEMPLATE: str = """-Goal-
4
- Please select the most relevant search result for the given entity.
5
- The name and description of the entity are provided. The search results are provided as a list.
6
- Please select the most relevant search result from the list. If none of the search results are relevant, please select 'None of the above'.
7
 
8
  Steps:
9
  1. Read the name and description of the entity.
10
 
11
- 2. Read the search results. For each search result, compare it with the entity name and description to determine if it is relevant.
12
 
13
- 3. Select the most relevant search result from the list. If none of the search results are relevant, select 'None of the above'.
14
 
15
  4. Output your selection directly, please do not provide any additional information.
16
 
 
1
  # pylint: disable=C0301
2
 
3
  TEMPLATE: str = """-Goal-
4
+ Please select the most relevant searcher result for the given entity.
5
+ The name and description of the entity are provided. The searcher results are provided as a list.
6
+ Please select the most relevant searcher result from the list. If none of the searcher results are relevant, please select 'None of the above'.
7
 
8
  Steps:
9
  1. Read the name and description of the entity.
10
 
11
+ 2. Read the searcher results. For each searcher result, compare it with the entity name and description to determine if it is relevant.
12
 
13
+ 3. Select the most relevant searcher result from the list. If none of the searcher results are relevant, select 'None of the above'.
14
 
15
  4. Output your selection directly, please do not provide any additional information.
16
 
graphgen/utils/run_concurrent.py CHANGED
@@ -10,77 +10,6 @@ T = TypeVar("T")
10
  R = TypeVar("R")
11
 
12
 
13
- # async def run_concurrent(
14
- # coro_fn: Callable[[T], Awaitable[R]],
15
- # items: List[T],
16
- # *,
17
- # desc: str = "processing",
18
- # unit: str = "item",
19
- # progress_bar: Optional[gr.Progress] = None,
20
- # ) -> List[R]:
21
- # tasks = [asyncio.create_task(coro_fn(it)) for it in items]
22
- #
23
- # results = []
24
- # async for future in tqdm_async(
25
- # tasks, desc=desc, unit=unit
26
- # ):
27
- # try:
28
- # result = await future
29
- # results.append(result)
30
- # except Exception as e: # pylint: disable=broad-except
31
- # logger.exception("Task failed: %s", e)
32
- #
33
- # if progress_bar is not None:
34
- # progress_bar((len(results)) / len(items), desc=desc)
35
- #
36
- # if progress_bar is not None:
37
- # progress_bar(1.0, desc=desc)
38
- # return results
39
-
40
- # results = await tqdm_async.gather(*tasks, desc=desc, unit=unit)
41
- #
42
- # ok_results = []
43
- # for idx, res in enumerate(results):
44
- # if isinstance(res, Exception):
45
- # logger.exception("Task failed: %s", res)
46
- # if progress_bar:
47
- # progress_bar((idx + 1) / len(items), desc=desc)
48
- # continue
49
- # ok_results.append(res)
50
- # if progress_bar:
51
- # progress_bar((idx + 1) / len(items), desc=desc)
52
- #
53
- # if progress_bar:
54
- # progress_bar(1.0, desc=desc)
55
- # return ok_results
56
-
57
- # async def run_concurrent(
58
- # coro_fn: Callable[[T], Awaitable[R]],
59
- # items: List[T],
60
- # *,
61
- # desc: str = "processing",
62
- # unit: str = "item",
63
- # progress_bar: Optional[gr.Progress] = None,
64
- # ) -> List[R]:
65
- # tasks = [asyncio.create_task(coro_fn(it)) for it in items]
66
- #
67
- # results = []
68
- # # 使用同步方式更新进度条,避免异步冲突
69
- # for i, task in enumerate(asyncio.as_completed(tasks)):
70
- # try:
71
- # result = await task
72
- # results.append(result)
73
- # # 同步更新进度条
74
- # if progress_bar is not None:
75
- # # 在同步上下文中更新进度
76
- # progress_bar((i + 1) / len(items), desc=desc)
77
- # except Exception as e:
78
- # logger.exception("Task failed: %s", e)
79
- # results.append(e)
80
- #
81
- # return results
82
-
83
-
84
  async def run_concurrent(
85
  coro_fn: Callable[[T], Awaitable[R]],
86
  items: List[T],
 
10
  R = TypeVar("R")
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  async def run_concurrent(
14
  coro_fn: Callable[[T], Awaitable[R]],
15
  items: List[T],