chenzihong-gavin committed on
Commit acd7cf4 · 1 Parent(s): 6505eee
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .env.example +6 -0
  2. .gitignore +179 -0
  3. LICENSE +201 -0
  4. graphgen/__init__.py +0 -0
  5. graphgen/configs/config.yaml.example +16 -0
  6. graphgen/configs/graphgen_config.yaml +16 -0
  7. graphgen/evaluate.py +142 -0
  8. graphgen/generate.py +101 -0
  9. graphgen/graphgen.py +260 -0
  10. graphgen/judge.py +60 -0
  11. graphgen/models/__init__.py +41 -0
  12. graphgen/models/embed/__init__.py +0 -0
  13. graphgen/models/embed/embedding.py +29 -0
  14. graphgen/models/evaluate/__init__.py +0 -0
  15. graphgen/models/evaluate/base_evaluator.py +51 -0
  16. graphgen/models/evaluate/length_evaluator.py +22 -0
  17. graphgen/models/evaluate/mtld_evaluator.py +76 -0
  18. graphgen/models/evaluate/reward_evaluator.py +101 -0
  19. graphgen/models/evaluate/uni_evaluator.py +159 -0
  20. graphgen/models/llm/__init__.py +0 -0
  21. graphgen/models/llm/limitter.py +88 -0
  22. graphgen/models/llm/openai_model.py +130 -0
  23. graphgen/models/llm/tokenizer.py +73 -0
  24. graphgen/models/llm/topk_token_model.py +48 -0
  25. graphgen/models/search/__init__.py +0 -0
  26. graphgen/models/search/wiki_search.py +36 -0
  27. graphgen/models/storage/__init__.py +0 -0
  28. graphgen/models/storage/base_storage.py +94 -0
  29. graphgen/models/storage/json_storage.py +51 -0
  30. graphgen/models/storage/networkx_storage.py +159 -0
  31. graphgen/models/strategy/__init__.py +0 -0
  32. graphgen/models/strategy/base_strategy.py +5 -0
  33. graphgen/models/strategy/travserse_strategy.py +30 -0
  34. graphgen/models/text/__init__.py +0 -0
  35. graphgen/models/text/chunk.py +7 -0
  36. graphgen/models/text/text_pair.py +9 -0
  37. graphgen/operators/__init__.py +16 -0
  38. graphgen/operators/extract_kg.py +132 -0
  39. graphgen/operators/judge.py +188 -0
  40. graphgen/operators/merge_kg.py +215 -0
  41. graphgen/operators/quiz.py +109 -0
  42. graphgen/operators/resolute_coreference.py +33 -0
  43. graphgen/operators/search_wikipedia.py +71 -0
  44. graphgen/operators/split_graph.py +333 -0
  45. graphgen/operators/traverse_graph.py +485 -0
  46. graphgen/templates/__init__.py +9 -0
  47. graphgen/templates/answer_rephrasing.py +219 -0
  48. graphgen/templates/coreference_resolution.py +39 -0
  49. graphgen/templates/description_rephrasing.py +121 -0
  50. graphgen/templates/kg_extraction.py +210 -0
.env.example ADDED
@@ -0,0 +1,6 @@
+ SYNTHESIZER_MODEL=
+ SYNTHESIZER_BASE_URL=
+ SYNTHESIZER_API_KEY=
+ TRAINEE_MODEL=
+ TRAINEE_BASE_URL=
+ TRAINEE_API_KEY=
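
Note: these variables are read at runtime with python-dotenv; graphgen/generate.py below builds the two OpenAI-compatible clients from them. A minimal sketch of that lookup (placeholder values, no real credentials):

    from dotenv import load_dotenv
    import os

    load_dotenv()  # reads .env from the current working directory
    model = os.getenv("SYNTHESIZER_MODEL")  # an OpenAI-compatible model name
    base_url = os.getenv("SYNTHESIZER_BASE_URL")
    api_key = os.getenv("SYNTHESIZER_API_KEY")
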
.gitignore ADDED
@@ -0,0 +1,179 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ cache
+ *.pyc
+ *.html
+ .gradio
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
graphgen/__init__.py ADDED
File without changes
graphgen/configs/config.yaml.example ADDED
@@ -0,0 +1,16 @@
+ data_type: raw
+ input_file: resources/examples/raw_demo.jsonl
+ tokenizer: cl100k_base
+ quiz_samples: 2
+ traverse_strategy:
+   qa_form: atomic
+   bidirectional: true
+   edge_sampling: max_loss
+   expand_method: max_tokens
+   isolated_node_strategy: add
+   max_depth: 2
+   max_extra_edges: 5
+   max_tokens: 256
+   loss_strategy: only_edge
+ web_search: false
+ re_judge: false
graphgen/configs/graphgen_config.yaml ADDED
@@ -0,0 +1,16 @@
+ data_type: raw
+ input_file: resources/examples/raw_demo.jsonl
+ tokenizer: cl100k_base
+ quiz_samples: 2
+ traverse_strategy:
+   qa_form: aggregated
+   bidirectional: true
+   edge_sampling: max_loss
+   expand_method: max_width
+   isolated_node_strategy: ignore
+   max_depth: 1
+   max_extra_edges: 2
+   max_tokens: 256
+   loss_strategy: only_edge
+ web_search: false
+ re_judge: false
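
Note: generate.py below loads this file with yaml.load and splats the traverse_strategy mapping into the TraverseStrategy dataclass, so every key under traverse_strategy must match a field name. A minimal sketch of that flow (paths assume the repo layout above):

    import yaml
    from graphgen.models import TraverseStrategy

    with open("graphgen/configs/graphgen_config.yaml", "r", encoding="utf-8") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    # each key under traverse_strategy becomes a constructor argument
    strategy = TraverseStrategy(**config["traverse_strategy"])
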
graphgen/evaluate.py ADDED
@@ -0,0 +1,142 @@
+ """Evaluate the quality of the generated text using various metrics"""
+
+ import os
+ import json
+ import argparse
+ import pandas as pd
+ from dotenv import load_dotenv
+ from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, TextPair, UniEvaluator
+ from .utils import logger, set_logger
+
+ sys_path = os.path.abspath(os.path.dirname(__file__))
+ set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log"))
+
+ load_dotenv()
+
+ def evaluate_length(corpus, tokenizer_name):
+     length_evaluator = LengthEvaluator(
+         tokenizer_name=tokenizer_name
+     )
+     logger.info("Length evaluator loaded")
+     scores = length_evaluator.get_average_score(corpus)
+     logger.info("Length scores: %s", scores)
+     return scores
+
+ def evaluate_mtld(corpus):
+     mtld_evaluator = MTLDEvaluator()
+     logger.info("MTLD evaluator loaded")
+     scores = mtld_evaluator.get_average_score(corpus)
+     logger.info("MTLD scores: %s", scores)
+     min_max_scores = mtld_evaluator.get_min_max_score(corpus)
+     logger.info("MTLD min max scores: %s", min_max_scores)
+     return scores, min_max_scores
+
+ def evaluate_reward(corpus, reward_model_names):
+     scores = []
+     for reward_name in reward_model_names:
+         reward_evaluator = RewardEvaluator(
+             reward_name=reward_name
+         )
+         logger.info("Loaded reward model: %s", reward_name)
+         average_score = reward_evaluator.get_average_score(corpus)
+         logger.info("%s scores: %s", reward_name, average_score)
+         min_max_scores = reward_evaluator.get_min_max_score(corpus)
+         logger.info("%s min max scores: %s", reward_name, min_max_scores)
+         scores.append({
+             'reward_name': reward_name.split('/')[-1],
+             'score': average_score,
+             'min_max_scores': min_max_scores
+         })
+         del reward_evaluator
+         clean_gpu_cache()
+     return scores
+
+ def evaluate_uni(corpus, uni_model_name):
+     uni_evaluator = UniEvaluator(
+         model_name=uni_model_name
+     )
+     logger.info("Uni evaluator loaded with model %s", uni_model_name)
+     uni_scores = uni_evaluator.get_average_score(corpus)
+     for key, value in uni_scores.items():
+         logger.info("Uni %s scores: %s", key, value)
+     min_max_scores = uni_evaluator.get_min_max_score(corpus)
+     for key, value in min_max_scores.items():
+         logger.info("Uni %s min max scores: %s", key, value)
+     del uni_evaluator
+     clean_gpu_cache()
+     return (uni_scores['naturalness'], uni_scores['coherence'], uni_scores['understandability'],
+             min_max_scores['naturalness'], min_max_scores['coherence'], min_max_scores['understandability'])
+
+
+ def clean_gpu_cache():
+     import torch
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+
+ if __name__ == '__main__':
+     import torch.multiprocessing as mp
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument('--folder', type=str, default='cache/data', help='folder to load data')
+     parser.add_argument('--output', type=str, default='cache/output', help='path to save output')
+
+     parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name')
+     parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2',
+                         help='Comma-separated list of reward models')
+     parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name')
+
+     args = parser.parse_args()
+
+     if not os.path.exists(args.folder):
+         raise ValueError(f"Folder {args.folder} does not exist")
+
+     if not os.path.exists(args.output):
+         os.makedirs(args.output)
+
+     reward_models = args.reward.split(',')
+
+     results = []
+
+     logger.info("Data loaded from %s", args.folder)
+     mp.set_start_method('spawn')
+
+     for file in os.listdir(args.folder):
+         if file.endswith('.json'):
+             logger.info("Processing %s", file)
+             with open(os.path.join(args.folder, file), 'r', encoding='utf-8') as f:
+                 data = json.load(f)
+             data = [TextPair(
+                 question=data[key]['question'],
+                 answer=data[key]['answer']
+             ) for key in data]
+
+             length_scores = evaluate_length(data, args.tokenizer)
+             mtld_scores, min_max_mtld_scores = evaluate_mtld(data)
+             reward_scores = evaluate_reward(data, reward_models)
+             uni_naturalness_scores, uni_coherence_scores, uni_understandability_scores, \
+                 min_max_uni_naturalness_scores, min_max_uni_coherence_scores, min_max_uni_understandability_scores \
+                 = evaluate_uni(data, args.uni)
+
+             result = {
+                 'file': file,
+                 'number': len(data),
+                 'length': length_scores,
+                 'mtld': mtld_scores,
+                 'mtld_min_max': min_max_mtld_scores,
+                 'uni_naturalness': uni_naturalness_scores,
+                 'uni_coherence': uni_coherence_scores,
+                 'uni_understandability': uni_understandability_scores,
+                 'uni_naturalness_min_max': min_max_uni_naturalness_scores,
+                 'uni_coherence_min_max': min_max_uni_coherence_scores,
+                 'uni_understandability_min_max': min_max_uni_understandability_scores
+             }
+             for reward_score in reward_scores:
+                 result[reward_score['reward_name']] = reward_score['score']
+                 result[f"{reward_score['reward_name']}_min_max"] = reward_score['min_max_scores']
+
+             results.append(result)
+
+     results = pd.DataFrame(results)
+     results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False)
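
Note: the main loop above expects each *.json file under --folder to map arbitrary keys to objects with question and answer fields. A sketch of writing a compatible input file (file name and QA text are illustrative):

    import json

    qa = {
        "qa-0": {"question": "What does GraphGen generate?", "answer": "QA pairs from a knowledge graph."},
    }
    with open("cache/data/demo.json", "w", encoding="utf-8") as f:
        json.dump(qa, f, ensure_ascii=False)
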
graphgen/generate.py ADDED
@@ -0,0 +1,101 @@
+ import os
+ import json
+ import time
+ import argparse
+ from importlib.resources import files
+ import yaml
+ from dotenv import load_dotenv
+
+ from .graphgen import GraphGen
+ from .models import OpenAIModel, Tokenizer, TraverseStrategy
+ from .utils import set_logger
+
+ sys_path = os.path.abspath(os.path.dirname(__file__))
+
+ load_dotenv()
+
+ def set_working_dir(folder):
+     os.makedirs(folder, exist_ok=True)
+     os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
+     os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
+
+ def save_config(config_path, global_config):
+     if not os.path.exists(os.path.dirname(config_path)):
+         os.makedirs(os.path.dirname(config_path))
+     with open(config_path, "w", encoding='utf-8') as config_file:
+         yaml.dump(global_config, config_file, default_flow_style=False, allow_unicode=True)
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--config_file',
+                         help='Config parameters for GraphGen.',
+                         # default=os.path.join(sys_path, "configs", "graphgen_config.yaml"),
+                         default=files('graphgen').joinpath("configs", "graphgen_config.yaml"),
+                         type=str)
+     parser.add_argument('--output_dir',
+                         help='Output directory for GraphGen.',
+                         default=sys_path,
+                         required=True,
+                         type=str)
+
+     args = parser.parse_args()
+
+     working_dir = args.output_dir
+     set_working_dir(working_dir)
+     unique_id = int(time.time())
+     set_logger(os.path.join(working_dir, "logs", f"graphgen_{unique_id}.log"), if_stream=False)
+
+     with open(args.config_file, "r", encoding='utf-8') as f:
+         config = yaml.load(f, Loader=yaml.FullLoader)
+
+     input_file = config['input_file']
+
+     if config['data_type'] == 'raw':
+         with open(input_file, "r", encoding='utf-8') as f:
+             data = [json.loads(line) for line in f]
+     elif config['data_type'] == 'chunked':
+         with open(input_file, "r", encoding='utf-8') as f:
+             data = json.load(f)
+     else:
+         raise ValueError(f"Invalid data type: {config['data_type']}")
+
+     synthesizer_llm_client = OpenAIModel(
+         model_name=os.getenv("SYNTHESIZER_MODEL"),
+         api_key=os.getenv("SYNTHESIZER_API_KEY"),
+         base_url=os.getenv("SYNTHESIZER_BASE_URL")
+     )
+     trainee_llm_client = OpenAIModel(
+         model_name=os.getenv("TRAINEE_MODEL"),
+         api_key=os.getenv("TRAINEE_API_KEY"),
+         base_url=os.getenv("TRAINEE_BASE_URL")
+     )
+
+     traverse_strategy = TraverseStrategy(
+         **config['traverse_strategy']
+     )
+
+     graph_gen = GraphGen(
+         working_dir=working_dir,
+         unique_id=unique_id,
+         synthesizer_llm_client=synthesizer_llm_client,
+         trainee_llm_client=trainee_llm_client,
+         if_web_search=config['web_search'],
+         tokenizer_instance=Tokenizer(
+             model_name=config['tokenizer']
+         ),
+         traverse_strategy=traverse_strategy
+     )
+
+     graph_gen.insert(data, config['data_type'])
+
+     graph_gen.quiz(max_samples=config['quiz_samples'])
+
+     graph_gen.judge(re_judge=config["re_judge"])
+
+     graph_gen.traverse()
+
+     path = os.path.join(working_dir, "data", "graphgen", str(unique_id), f"config-{unique_id}.yaml")
+     save_config(path, config)
+
+ if __name__ == '__main__':
+     main()
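
Note: with data_type: raw, every line of input_file is parsed as an independent JSON object, and async_split_chunks in graphgen.py below reads its content field. An illustrative writer for such a JSONL file (document text invented):

    import json

    docs = [{"content": "First source document..."},
            {"content": "Second source document..."}]
    with open("resources/examples/raw_demo.jsonl", "w", encoding="utf-8") as f:
        for doc in docs:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")
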
graphgen/graphgen.py ADDED
@@ -0,0 +1,260 @@
+ # Adapted from https://github.com/HKUDS/LightRAG
+
+ import asyncio
+ import os
+ import time
+ from dataclasses import dataclass, field
+ from typing import List, Union, cast
+
+ import gradio as gr
+ from tqdm.asyncio import tqdm as tqdm_async
+
+ from .models import (
+     Chunk,
+     JsonKVStorage,
+     NetworkXStorage,
+     OpenAIModel,
+     Tokenizer,
+     TraverseStrategy,
+     WikiSearch,
+ )
+ from .models.storage.base_storage import StorageNameSpace
+ from .operators import (
+     extract_kg,
+     judge_statement,
+     quiz,
+     search_wikipedia,
+     skip_judge_statement,
+     traverse_graph_atomically,
+     traverse_graph_by_edge,
+     traverse_graph_for_multi_hop,
+ )
+ from .utils import compute_content_hash, create_event_loop, logger
+
+ sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+
+ @dataclass
+ class GraphGen:
+     unique_id: int = int(time.time())
+     working_dir: str = os.path.join(sys_path, "cache")
+
+     # text chunking
+     chunk_size: int = 1024
+     chunk_overlap_size: int = 100
+
+     # llm
+     synthesizer_llm_client: OpenAIModel = None
+     trainee_llm_client: OpenAIModel = None
+     tokenizer_instance: Tokenizer = None
+
+     # web search
+     if_web_search: bool = False
+     wiki_client: WikiSearch = field(default_factory=WikiSearch)
+
+     # traverse strategy
+     traverse_strategy: TraverseStrategy = field(default_factory=TraverseStrategy)
+
+     # webui
+     progress_bar: gr.Progress = None
+
+     def __post_init__(self):
+         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="full_docs"
+         )
+         self.text_chunks_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="text_chunks"
+         )
+         self.wiki_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="wiki"
+         )
+         self.graph_storage: NetworkXStorage = NetworkXStorage(
+             self.working_dir, namespace="graph"
+         )
+         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="rephrase"
+         )
+         self.qa_storage: JsonKVStorage = JsonKVStorage(
+             os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)), namespace=f"qa-{self.unique_id}"
+         )
+
+     async def async_split_chunks(self, data: Union[List[list], List[dict]], data_type: str) -> dict:
+         # TODO: decide whether to apply coreference resolution here
+         if len(data) == 0:
+             return {}
+
+         new_docs = {}
+         inserting_chunks = {}
+         if data_type == "raw":
+             assert isinstance(data, list) and isinstance(data[0], dict)
+             # compute hash for each document
+             new_docs = {
+                 compute_content_hash(doc['content'], prefix="doc-"): {'content': doc['content']} for doc in data
+             }
+             _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
+             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+             if len(new_docs) == 0:
+                 logger.warning("All docs are already in the storage")
+                 return {}
+             logger.info("[New Docs] inserting %d docs", len(new_docs))
+
+             cur_index = 1
+             doc_number = len(new_docs)
+             async for doc_key, doc in tqdm_async(
+                 new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
+             ):
+                 chunks = {
+                     compute_content_hash(dp["content"], prefix="chunk-"): {
+                         **dp,
+                         'full_doc_id': doc_key
+                     } for dp in self.tokenizer_instance.chunk_by_token_size(doc["content"],
+                                                                             self.chunk_overlap_size, self.chunk_size)
+                 }
+                 inserting_chunks.update(chunks)
+
+                 if self.progress_bar is not None:
+                     self.progress_bar(
+                         cur_index / doc_number, f"Chunking {doc_key}"
+                     )
+                 cur_index += 1
+
+             _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys()))
+             inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys}
+         elif data_type == "chunked":
+             assert isinstance(data, list) and isinstance(data[0], list)
+             new_docs = {
+                 compute_content_hash("".join(chunk['content']), prefix="doc-"): {'content': "".join(chunk['content'])}
+                 for doc in data for chunk in doc
+             }
+             _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
+             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+             if len(new_docs) == 0:
+                 logger.warning("All docs are already in the storage")
+                 return {}
+             logger.info("[New Docs] inserting %d docs", len(new_docs))
+             async for doc in tqdm_async(data, desc="[1/4]Chunking documents", unit="doc"):
+                 doc_str = "".join([chunk['content'] for chunk in doc])
+                 for chunk in doc:
+                     chunk_key = compute_content_hash(chunk['content'], prefix="chunk-")
+                     inserting_chunks[chunk_key] = {
+                         **chunk,
+                         'full_doc_id': compute_content_hash(doc_str, prefix="doc-")
+                     }
+             _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys()))
+             inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys}
+
+         await self.full_docs_storage.upsert(new_docs)
+         await self.text_chunks_storage.upsert(inserting_chunks)
+
+         return inserting_chunks
+
+     def insert(self, data: Union[List[list], List[dict]], data_type: str):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_insert(data, data_type))
+
+     async def async_insert(self, data: Union[List[list], List[dict]], data_type: str):
+         """
+         Insert chunks into the graph.
+         """
+         inserting_chunks = await self.async_split_chunks(data, data_type)
+
+         if len(inserting_chunks) == 0:
+             logger.warning("All chunks are already in the storage")
+             return
+         logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
+
+         logger.info("[Entity and Relation Extraction]...")
+         _add_entities_and_relations = await extract_kg(
+             llm_client=self.synthesizer_llm_client,
+             kg_instance=self.graph_storage,
+             tokenizer_instance=self.tokenizer_instance,
+             chunks=[Chunk(id=k, content=v['content']) for k, v in inserting_chunks.items()],
+             progress_bar=self.progress_bar,
+         )
+         if not _add_entities_and_relations:
+             logger.warning("No entities or relations extracted")
+             return
+
+         logger.info("[Wiki Search] is %s", 'enabled' if self.if_web_search else 'disabled')
+         if self.if_web_search:
+             logger.info("[Wiki Search]...")
+             _add_wiki_data = await search_wikipedia(
+                 llm_client=self.synthesizer_llm_client,
+                 wiki_search_client=self.wiki_client,
+                 knowledge_graph_instance=_add_entities_and_relations
+             )
+             await self.wiki_storage.upsert(_add_wiki_data)
+
+         await self._insert_done()
+
+     async def _insert_done(self):
+         tasks = []
+         for storage_instance in [self.full_docs_storage, self.text_chunks_storage,
+                                  self.graph_storage, self.wiki_storage]:
+             if storage_instance is None:
+                 continue
+             tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
+         await asyncio.gather(*tasks)
+
+     def quiz(self, max_samples=1):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_quiz(max_samples))
+
+     async def async_quiz(self, max_samples=1):
+         await quiz(self.synthesizer_llm_client, self.graph_storage, self.rephrase_storage, max_samples)
+         await self.rephrase_storage.index_done_callback()
+
+     def judge(self, re_judge=False, skip=False):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_judge(re_judge, skip))
+
+     async def async_judge(self, re_judge=False, skip=False):
+         if skip:
+             _update_relations = await skip_judge_statement(self.graph_storage)
+         else:
+             _update_relations = await judge_statement(self.trainee_llm_client, self.graph_storage,
+                                                       self.rephrase_storage, re_judge)
+         await _update_relations.index_done_callback()
+
+     def traverse(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_traverse())
+
+     async def async_traverse(self):
+         if self.traverse_strategy.qa_form == "atomic":
+             results = await traverse_graph_atomically(self.synthesizer_llm_client,
+                                                       self.tokenizer_instance,
+                                                       self.graph_storage,
+                                                       self.traverse_strategy,
+                                                       self.text_chunks_storage,
+                                                       self.progress_bar)
+         elif self.traverse_strategy.qa_form == "multi_hop":
+             results = await traverse_graph_for_multi_hop(self.synthesizer_llm_client,
+                                                          self.tokenizer_instance,
+                                                          self.graph_storage,
+                                                          self.traverse_strategy,
+                                                          self.text_chunks_storage,
+                                                          self.progress_bar)
+         elif self.traverse_strategy.qa_form == "aggregated":
+             results = await traverse_graph_by_edge(self.synthesizer_llm_client, self.tokenizer_instance,
+                                                    self.graph_storage, self.traverse_strategy,
+                                                    self.text_chunks_storage, self.progress_bar)
+         else:
+             raise ValueError(f"Unknown qa_form: {self.traverse_strategy.qa_form}")
+         await self.qa_storage.upsert(results)
+         await self.qa_storage.index_done_callback()
+
+     def clear(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_clear())
+
+     async def async_clear(self):
+         await self.full_docs_storage.drop()
+         await self.text_chunks_storage.drop()
+         await self.wiki_storage.drop()
+         await self.graph_storage.clear()
+         await self.rephrase_storage.drop()
+         await self.qa_storage.drop()
+
+         logger.info("All caches are cleared")
graphgen/judge.py ADDED
@@ -0,0 +1,60 @@
+ import os
+ import argparse
+ import asyncio
+ from dotenv import load_dotenv
+
+ from .models import NetworkXStorage, JsonKVStorage, OpenAIModel
+ from .operators import judge_statement
+
+ sys_path = os.path.abspath(os.path.dirname(__file__))
+
+ load_dotenv()
+
+ def calculate_average_loss(graph: NetworkXStorage):
+     """
+     Calculate the average loss of the graph.
+
+     :param graph: NetworkXStorage
+     :return: float
+     """
+     edges = asyncio.run(graph.get_all_edges())
+     total_loss = 0
+     for edge in edges:
+         total_loss += edge[2]['loss']
+     return total_loss / len(edges)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--input', type=str, default=os.path.join(sys_path, "cache"), help='path to load input graph')
+     parser.add_argument('--output', type=str, default='cache/output/new_graph.graphml', help='path to save output')
+
+     args = parser.parse_args()
+
+     llm_client = OpenAIModel(
+         model_name=os.getenv("TRAINEE_MODEL"),
+         api_key=os.getenv("TRAINEE_API_KEY"),
+         base_url=os.getenv("TRAINEE_BASE_URL")
+     )
+
+     graph_storage = NetworkXStorage(
+         args.input,
+         namespace="graph"
+     )
+     average_loss = calculate_average_loss(graph_storage)
+     print(f"Average loss of the graph: {average_loss}")
+
+     rephrase_storage = JsonKVStorage(
+         os.path.join(sys_path, "cache"),
+         namespace="rephrase"
+     )
+
+     new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True))
+
+     graph_file = asyncio.run(graph_storage.get_graph())
+
+     new_graph.write_nx_graph(graph_file, args.output)
+
+     average_loss = calculate_average_loss(new_graph)
+     print(f"Average loss of the graph: {average_loss}")
graphgen/models/__init__.py ADDED
@@ -0,0 +1,41 @@
+ from .text.chunk import Chunk
+ from .text.text_pair import TextPair
+
+ from .llm.topk_token_model import Token, TopkTokenModel
+ from .llm.openai_model import OpenAIModel
+ from .llm.tokenizer import Tokenizer
+
+ from .storage.networkx_storage import NetworkXStorage
+ from .storage.json_storage import JsonKVStorage
+
+ from .search.wiki_search import WikiSearch
+
+ from .evaluate.length_evaluator import LengthEvaluator
+ from .evaluate.mtld_evaluator import MTLDEvaluator
+ from .evaluate.reward_evaluator import RewardEvaluator
+ from .evaluate.uni_evaluator import UniEvaluator
+
+ from .strategy.travserse_strategy import TraverseStrategy
+
+
+ __all__ = [
+     # llm models
+     "OpenAIModel",
+     "TopkTokenModel",
+     "Token",
+     "Tokenizer",
+     # storage models
+     "Chunk",
+     "NetworkXStorage",
+     "JsonKVStorage",
+     # search models
+     "WikiSearch",
+     # evaluate models
+     "TextPair",
+     "LengthEvaluator",
+     "MTLDEvaluator",
+     "RewardEvaluator",
+     "UniEvaluator",
+     # strategy models
+     "TraverseStrategy",
+ ]
graphgen/models/embed/__init__.py ADDED
File without changes
graphgen/models/embed/embedding.py ADDED
@@ -0,0 +1,29 @@
+ from dataclasses import dataclass
+ import asyncio
+ import numpy as np
+
+ class UnlimitedSemaphore:
+     """A context manager that allows unlimited access."""
+
+     async def __aenter__(self):
+         pass
+
+     async def __aexit__(self, exc_type, exc, tb):
+         pass
+
+ @dataclass
+ class EmbeddingFunc:
+     embedding_dim: int
+     max_token_size: int
+     func: callable
+     concurrent_limit: int = 16
+
+     def __post_init__(self):
+         if self.concurrent_limit != 0:
+             self._semaphore = asyncio.Semaphore(self.concurrent_limit)
+         else:
+             self._semaphore = UnlimitedSemaphore()
+
+     async def __call__(self, *args, **kwargs) -> np.ndarray:
+         async with self._semaphore:
+             return await self.func(*args, **kwargs)
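
Note: EmbeddingFunc gates concurrent calls with an asyncio.Semaphore unless concurrent_limit is 0. A usage sketch with a dummy embedding coroutine (the 768-dim zero vectors are a stand-in for a real model call):

    import asyncio
    import numpy as np

    async def fake_embed(texts):
        return np.zeros((len(texts), 768))  # stand-in embedding

    embed = EmbeddingFunc(embedding_dim=768, max_token_size=8192, func=fake_embed)
    vectors = asyncio.run(embed(["hello", "world"]))  # at most 16 calls run concurrently
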
graphgen/models/evaluate/__init__.py ADDED
File without changes
graphgen/models/evaluate/base_evaluator.py ADDED
@@ -0,0 +1,51 @@
+ import asyncio
+
+ from dataclasses import dataclass
+ from tqdm.asyncio import tqdm as tqdm_async
+ from graphgen.utils import create_event_loop
+ from graphgen.models.text.text_pair import TextPair
+
+ @dataclass
+ class BaseEvaluator:
+     max_concurrent: int = 100
+     results: list[float] = None
+
+     def evaluate(self, pairs: list[TextPair]) -> list[float]:
+         """
+         Evaluate the text and return a score.
+         """
+         return create_event_loop().run_until_complete(self.async_evaluate(pairs))
+
+     async def async_evaluate(self, pairs: list[TextPair]) -> list[float]:
+         semaphore = asyncio.Semaphore(self.max_concurrent)
+
+         async def evaluate_with_semaphore(pair):
+             async with semaphore:  # acquire the semaphore
+                 return await self.evaluate_single(pair)
+
+         results = []
+         for result in tqdm_async(
+             asyncio.as_completed([evaluate_with_semaphore(pair) for pair in pairs]),
+             total=len(pairs),
+         ):
+             results.append(await result)
+         return results
+
+     async def evaluate_single(self, pair: TextPair) -> float:
+         raise NotImplementedError()
+
+     def get_average_score(self, pairs: list[TextPair]) -> float:
+         """
+         Get the average score of a batch of texts.
+         """
+         results = self.evaluate(pairs)
+         self.results = results
+         return sum(self.results) / len(pairs)
+
+     def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]:
+         """
+         Get the min and max score of a batch of texts.
+         """
+         if self.results is None:
+             self.get_average_score(pairs)
+         return min(self.results), max(self.results)
graphgen/models/evaluate/length_evaluator.py ADDED
@@ -0,0 +1,22 @@
+ from dataclasses import dataclass
+ from graphgen.models.evaluate.base_evaluator import BaseEvaluator
+ from graphgen.models.llm.tokenizer import Tokenizer
+ from graphgen.models.text.text_pair import TextPair
+ from graphgen.utils import create_event_loop
+
+
+ @dataclass
+ class LengthEvaluator(BaseEvaluator):
+     tokenizer_name: str = "cl100k_base"
+     def __post_init__(self):
+         self.tokenizer = Tokenizer(
+             model_name=self.tokenizer_name
+         )
+
+     async def evaluate_single(self, pair: TextPair) -> float:
+         loop = create_event_loop()
+         return await loop.run_in_executor(None, self._calculate_length, pair.answer)
+
+     def _calculate_length(self, text: str) -> float:
+         tokens = self.tokenizer.encode_string(text)
+         return len(tokens)
graphgen/models/evaluate/mtld_evaluator.py ADDED
@@ -0,0 +1,76 @@
+ from dataclasses import dataclass, field
+ from typing import Set
+
+ from graphgen.models.evaluate.base_evaluator import BaseEvaluator
+ from graphgen.models.text.text_pair import TextPair
+ from graphgen.utils import detect_main_language, NLTKHelper, create_event_loop
+
+
+ nltk_helper = NLTKHelper()
+
+ @dataclass
+ class MTLDEvaluator(BaseEvaluator):
+     """
+     Metric for the lexical diversity of a text.
+     """
+     stopwords_en: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("english")))
+     stopwords_zh: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("chinese")))
+
+     async def evaluate_single(self, pair: TextPair) -> float:
+         loop = create_event_loop()
+         return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer)
+
+     def _calculate_mtld_score(self, text: str, threshold=0.72) -> float:
+         """
+         Compute MTLD (average of the forward and backward passes).
+
+         min is 1.0
+         higher is better
+         """
+         if not text or not text.strip():
+             return 0.0
+
+         lang = detect_main_language(text)
+         tokens = nltk_helper.word_tokenize(text, lang)
+
+         stopwords = self.stopwords_zh if lang == "zh" else self.stopwords_en
+         filtered_tokens = [word for word in tokens if word not in stopwords]
+         filtered_tokens = [word for word in filtered_tokens if word.isalnum()]
+
+         if not filtered_tokens:
+             return 0
+
+         # forward MTLD
+         forward_factors = self._compute_factors(filtered_tokens, threshold)
+
+         # backward MTLD
+         backward_factors = self._compute_factors(filtered_tokens[::-1], threshold)
+
+         # average the two directions
+         return (forward_factors + backward_factors) / 2
+
+     @staticmethod
+     def _compute_factors(tokens: list, threshold: float) -> float:
+         factors = 0
+         current_segment = []
+         unique_words = set()
+
+         for token in tokens:
+             current_segment.append(token)
+             unique_words.add(token)
+             ttr = len(unique_words) / len(current_segment)
+
+             if ttr <= threshold:
+                 factors += 1
+                 current_segment = []
+                 unique_words = set()
+
+         # handle the final, incomplete segment
+         if current_segment:
+             ttr = len(unique_words) / len(current_segment)
+             if ttr <= threshold:
+                 factors += 1
+             else:
+                 factors += (1 - (ttr - threshold) / (1 - threshold))
+
+         return len(tokens) / factors if factors > 0 else len(tokens)
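
Note: MTLD counts a "factor" each time the running type-token ratio of the filtered tokens drops to the 0.72 threshold; the score is tokens per factor, averaged over forward and backward passes, so repetitive text scores low. A usage sketch (sample sentence invented):

    from graphgen.models import MTLDEvaluator, TextPair

    evaluator = MTLDEvaluator()
    pairs = [TextPair(question="Q?", answer="The quick brown fox jumps over the lazy dog.")]
    print(evaluator.get_average_score(pairs))  # higher means more lexical diversity
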
graphgen/models/evaluate/reward_evaluator.py ADDED
@@ -0,0 +1,101 @@
+ from dataclasses import dataclass
+ from tqdm import tqdm
+ from graphgen.models.text.text_pair import TextPair
+
+
+ @dataclass
+ class RewardEvaluator:
+     """
+     Reward Model Evaluator.
+     OpenAssistant/reward-model-deberta-v3-large-v2: scores range over (-inf, inf); higher is better.
+     """
+     reward_name: str = "OpenAssistant/reward-model-deberta-v3-large-v2"
+     max_length: int = 2560
+     results: list[float] = None
+
+     def __post_init__(self):
+         import torch
+         self.num_gpus = torch.cuda.device_count()
+
+     @staticmethod
+     def process_chunk(rank, pairs, reward_name, max_length, return_dict):
+         import torch
+         from transformers import AutoModelForSequenceClassification, AutoTokenizer
+         device = f'cuda:{rank}'
+         torch.cuda.set_device(rank)
+
+         rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
+         tokenizer = AutoTokenizer.from_pretrained(reward_name)
+         rank_model.to(device)
+         rank_model.eval()
+
+         results = []
+         with torch.no_grad():
+             for pair in tqdm(pairs):
+                 inputs = tokenizer(
+                     pair.question,
+                     pair.answer,
+                     return_tensors="pt",
+                     max_length=max_length,
+                     truncation=True
+                 )
+                 inputs = {k: v.to(device) for k, v in inputs.items()}
+                 score = rank_model(**inputs).logits[0].item()
+                 results.append(score)
+
+         return_dict[rank] = results
+
+     def evaluate(self, pairs: list[TextPair]) -> list[float]:
+         import torch.multiprocessing as mp
+         chunk_size = len(pairs) // self.num_gpus
+         chunks = []
+         for i in range(self.num_gpus):
+             start = i * chunk_size
+             end = start + chunk_size
+             if i == self.num_gpus - 1:
+                 end = len(pairs)
+             chunks.append(pairs[start:end])
+
+         # multi-process
+         manager = mp.Manager()
+         return_dict = manager.dict()
+         processes = []
+
+         for rank, chunk in enumerate(chunks):
+             p = mp.Process(
+                 target=self.process_chunk,
+                 args=(rank, chunk, self.reward_name, self.max_length, return_dict)
+             )
+             p.start()
+             processes.append(p)
+
+         for p in processes:
+             p.join()
+
+         # merge results
+         results = []
+         for rank in range(len(chunks)):
+             results.extend(return_dict[rank])
+
+         for p in processes:
+             if p.is_alive():
+                 p.terminate()
+                 p.join()
+
+         return results
+
+     def get_average_score(self, pairs: list[TextPair]) -> float:
+         """
+         Get the average score of a batch of texts.
+         """
+         results = self.evaluate(pairs)
+         self.results = results
+         return sum(self.results) / len(pairs)
+
+     def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]:
+         """
+         Get the min and max score of a batch of texts.
+         """
+         if self.results is None:
+             self.get_average_score(pairs)
+         return min(self.results), max(self.results)
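
Note: unlike the BaseEvaluator subclasses, RewardEvaluator shards pairs across all visible GPUs with torch.multiprocessing, so the spawn start method must be set first (as evaluate.py above does). A usage sketch (requires at least one CUDA device; the model downloads on first use):

    import torch.multiprocessing as mp
    from graphgen.models import RewardEvaluator, TextPair

    if __name__ == '__main__':
        mp.set_start_method('spawn')
        evaluator = RewardEvaluator()  # OpenAssistant/reward-model-deberta-v3-large-v2
        pairs = [TextPair(question="What is MTLD?", answer="A lexical diversity metric.")]
        print(evaluator.get_average_score(pairs))
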
graphgen/models/evaluate/uni_evaluator.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/maszhongming/UniEval/tree/main
2
+
3
+ from dataclasses import dataclass, field
4
+ from tqdm import tqdm
5
+ from graphgen.models.text.text_pair import TextPair
6
+
7
+
8
+ def _add_questions(dimension: str, question: str, answer: str):
9
+ if dimension == "naturalness":
10
        cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + answer
    elif dimension == "coherence":
        cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
                    + answer + ' </s> dialogue history: ' + question
    elif dimension == "understandability":
        cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + answer
    else:
        raise NotImplementedError(
            'The input format for this dimension is still undefined. Please customize it first.')
    return cur_input


@dataclass
class UniEvaluator:
    model_name: str = "MingZhong/unieval-sum"
    dimensions: list = field(default_factory=lambda: ['naturalness', 'coherence', 'understandability'])
    max_length: int = 2560
    results: dict = None

    def __post_init__(self):
        import torch
        self.num_gpus = torch.cuda.device_count()
        self.results = {}

    @staticmethod
    def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict):
        import torch
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        device = f'cuda:{rank}'
        torch.cuda.set_device(rank)

        rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        rank_model.to(device)
        rank_model.eval()

        softmax = torch.nn.Softmax(dim=1)

        pos_id = tokenizer("Yes")["input_ids"][0]
        neg_id = tokenizer("No")["input_ids"][0]

        results = []
        with torch.no_grad():
            for pair in tqdm(pairs):
                text = _add_questions(dimension, pair.question, pair.answer)

                tgt = "No"

                encoded_src = tokenizer(
                    text,
                    max_length=max_length,
                    truncation=True,
                    padding=True,
                    return_tensors='pt'
                )
                encoded_tgt = tokenizer(
                    tgt,
                    max_length=max_length,
                    truncation=True,
                    padding=True,
                    return_tensors='pt'
                )

                src_tokens = encoded_src['input_ids'].to(device)
                src_mask = encoded_src['attention_mask'].to(device)

                tgt_tokens = encoded_tgt['input_ids'].to(device)[:, 0].unsqueeze(-1)

                output = rank_model(
                    input_ids=src_tokens,
                    attention_mask=src_mask,
                    labels=tgt_tokens,
                    use_cache=False
                )

                logits = output.logits.view(-1, rank_model.config.vocab_size)

                pos_score = softmax(logits)[:, pos_id]  # probability of "Yes"
                neg_score = softmax(logits)[:, neg_id]  # probability of "No"
                score = pos_score / (pos_score + neg_score)

                results.append(score.item())

        return_dict[rank] = results

    def evaluate(self, pairs: list[TextPair]) -> list[dict]:
        import torch.multiprocessing as mp
        final_results = []
        for dimension in self.dimensions:
            chunk_size = len(pairs) // self.num_gpus
            chunks = []
            for i in range(self.num_gpus):
                start = i * chunk_size
                end = start + chunk_size
                if i == self.num_gpus - 1:
                    end = len(pairs)
                chunks.append(pairs[start:end])

            # multi-process
            manager = mp.Manager()
            return_dict = manager.dict()
            processes = []

            for rank, chunk in enumerate(chunks):
                p = mp.Process(
                    target=self.process_chunk,
                    args=(rank, chunk, self.model_name, self.max_length, dimension, return_dict)
                )
                p.start()
                processes.append(p)

            for p in processes:
                p.join()

            # merge the per-GPU results
            results = []
            for rank in range(len(chunks)):
                results.extend(return_dict[rank])

            for p in processes:
                if p.is_alive():
                    p.terminate()
                    p.join()

            final_results.append({
                dimension: results
            })
        return final_results

    def get_average_score(self, pairs: list[TextPair]) -> dict:
        """
        Get the average score of a batch of texts.
        """
        results = self.evaluate(pairs)
        final_results = {}
        for result in results:
            for key, value in result.items():
                final_results[key] = sum(value) / len(value)
                self.results[key] = value
        return final_results

    def get_min_max_score(self, pairs: list[TextPair]) -> dict:
        """
        Get the min and max score of a batch of texts.
        """
        if not self.results:  # empty until evaluate() has run
            self.get_average_score(pairs)
        final_results = {}
        for key, value in self.results.items():
            final_results[key] = min(value), max(value)
        return final_results
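For reference, a minimal usage sketch of the evaluator above. The import paths are assumptions (TextPair and UniEvaluator re-exported from graphgen.models), and running it requires at least one CUDA device plus the transformers package.

# Usage sketch; import paths and CUDA availability are assumptions.
from graphgen.models import TextPair, UniEvaluator

pairs = [
    TextPair(question="What is a knowledge graph?",
             answer="A structured representation of entities and relations."),
]
evaluator = UniEvaluator()                  # defaults to MingZhong/unieval-sum
print(evaluator.get_average_score(pairs))   # e.g. {'naturalness': 0.87, ...}
print(evaluator.get_min_max_score(pairs))   # per-dimension (min, max) tuples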
graphgen/models/llm/__init__.py ADDED
File without changes
graphgen/models/llm/limitter.py ADDED
@@ -0,0 +1,88 @@
import time
from datetime import datetime, timedelta
import asyncio

from graphgen.utils import logger


class RPM:

    def __init__(self, rpm: int = 1000):
        self.rpm = rpm
        self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0}

    def get_minute_slot(self):
        current_time = time.time()
        dt_object = datetime.fromtimestamp(current_time)
        total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute
        return total_minutes_since_midnight

    async def wait(self, silent=False):
        current = time.time()
        dt_object = datetime.fromtimestamp(current)
        minute_slot = self.get_minute_slot()

        if self.record['rpm_slot'] == minute_slot:
            # check whether the RPM limit has been exceeded
            if self.record['counter'] >= self.rpm:
                # wait until the next minute
                next_minute = dt_object.replace(
                    second=0, microsecond=0) + timedelta(minutes=1)
                _next = next_minute.timestamp()
                sleep_time = abs(_next - current)
                if not silent:
                    logger.info('RPM sleep %s', sleep_time)
                await asyncio.sleep(sleep_time)

                self.record = {
                    'rpm_slot': self.get_minute_slot(),
                    'counter': 0
                }
        else:
            self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0}
        self.record['counter'] += 1

        if not silent:
            logger.debug(self.record)


class TPM:

    def __init__(self, tpm: int = 20000):
        self.tpm = tpm
        self.record = {'tpm_slot': self.get_minute_slot(), 'counter': 0}

    def get_minute_slot(self):
        current_time = time.time()
        dt_object = datetime.fromtimestamp(current_time)
        total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute
        return total_minutes_since_midnight

    async def wait(self, token_count, silent=False):
        current = time.time()
        dt_object = datetime.fromtimestamp(current)
        minute_slot = self.get_minute_slot()

        # a new minute slot has begun: reset the counter and return
        if self.record['tpm_slot'] != minute_slot:
            self.record = {'tpm_slot': minute_slot, 'counter': token_count}
            return

        # check whether the TPM limit has been exceeded
        self.record['counter'] += token_count
        if self.record['counter'] > self.tpm:
            # wait until the next minute
            next_minute = dt_object.replace(
                second=0, microsecond=0) + timedelta(minutes=1)
            _next = next_minute.timestamp()
            sleep_time = abs(_next - current)
            logger.info('TPM sleep %s', sleep_time)
            await asyncio.sleep(sleep_time)

            self.record = {
                'tpm_slot': self.get_minute_slot(),
                'counter': token_count
            }

        if not silent:
            logger.debug(self.record)
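Both limiters key their counters to the current wall-clock minute rather than a true sliding window, so a burst can straddle a slot boundary. A minimal sketch of gating requests on both limiters, with hypothetical budget values:

# Sketch: gate each request on both limiters before dispatching it.
import asyncio
from graphgen.models.llm.limitter import RPM, TPM

async def main():
    rpm, tpm = RPM(rpm=60), TPM(tpm=10000)            # hypothetical budgets
    for i in range(3):
        await rpm.wait(silent=True)
        await tpm.wait(token_count=800, silent=True)  # estimated tokens for this call
        print(f"request {i} dispatched")

asyncio.run(main())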
graphgen/models/llm/openai_model.py ADDED
@@ -0,0 +1,130 @@
import math
from dataclasses import dataclass, field
from typing import List, Dict, Optional
import openai
from openai import AsyncOpenAI, RateLimitError, APIConnectionError, APITimeoutError
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)

from graphgen.models.llm.topk_token_model import TopkTokenModel, Token
from graphgen.models.llm.tokenizer import Tokenizer
from graphgen.models.llm.limitter import RPM, TPM


def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]:
    token_logprobs = response.choices[0].logprobs.content
    tokens = []
    for token_prob in token_logprobs:
        prob = math.exp(token_prob.logprob)
        candidate_tokens = [
            Token(t.token, math.exp(t.logprob))
            for t in token_prob.top_logprobs
        ]
        token = Token(token_prob.token, prob, top_candidates=candidate_tokens)
        tokens.append(token)
    return tokens


@dataclass
class OpenAIModel(TopkTokenModel):
    model_name: str = "gpt-4o-mini"
    api_key: str = None
    base_url: str = None

    system_prompt: str = ""
    json_mode: bool = False
    seed: int = None

    token_usage: list = field(default_factory=list)
    request_limit: bool = False
    rpm: RPM = field(default_factory=lambda: RPM(rpm=1000))
    tpm: TPM = field(default_factory=lambda: TPM(tpm=50000))

    def __post_init__(self):
        assert self.api_key is not None, "Please provide an api key to access the openai api."
        if self.api_key == "":
            self.api_key = "none"
        self.client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)

    def _pre_generate(self, text: str, history: List[str]) -> Dict:
        kwargs = {
            "temperature": self.temperature,
            "top_p": self.topp,
            "max_tokens": self.max_tokens,
        }
        if self.seed:
            kwargs["seed"] = self.seed
        if self.json_mode:
            kwargs["response_format"] = {"type": "json_object"}

        messages = []
        if self.system_prompt:
            messages.append({"role": "system", "content": self.system_prompt})
        messages.append({"role": "user", "content": text})

        if history:
            assert len(history) % 2 == 0, "History should have an even number of elements."
            messages = history + messages

        kwargs["messages"] = messages
        return kwargs

    @retry(
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)),
    )
    async def generate_topk_per_token(self, text: str, history: Optional[List[str]] = None) -> List[Token]:
        kwargs = self._pre_generate(text, history)
        if self.topk_per_token > 0:
            kwargs["logprobs"] = True
            kwargs["top_logprobs"] = self.topk_per_token

        # Limit max_tokens to 1 to avoid long completions
        kwargs["max_tokens"] = 1

        completion = await self.client.chat.completions.create(  # pylint: disable=E1125
            model=self.model_name,
            **kwargs
        )

        tokens = get_top_response_tokens(completion)

        return tokens

    @retry(
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)),
    )
    async def generate_answer(self, text: str, history: Optional[List[str]] = None, temperature: float = 0) -> str:
        kwargs = self._pre_generate(text, history)
        kwargs["temperature"] = temperature

        prompt_tokens = 0
        for message in kwargs["messages"]:
            prompt_tokens += len(Tokenizer().encode_string(message["content"]))
        estimated_tokens = prompt_tokens + kwargs["max_tokens"]

        if self.request_limit:
            await self.rpm.wait(silent=True)
            await self.tpm.wait(estimated_tokens, silent=True)

        completion = await self.client.chat.completions.create(  # pylint: disable=E1125
            model=self.model_name,
            **kwargs
        )
        if hasattr(completion, "usage"):
            self.token_usage.append({
                "prompt_tokens": completion.usage.prompt_tokens,
                "completion_tokens": completion.usage.completion_tokens,
                "total_tokens": completion.usage.total_tokens,
            })
        return completion.choices[0].message.content

    async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]:
        raise NotImplementedError
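A minimal sketch of driving this client against an OpenAI-compatible endpoint; the environment variable names are placeholders, and generate_topk_per_token requires an endpoint that returns logprobs:

import os
import asyncio
from graphgen.models.llm.openai_model import OpenAIModel

async def main():
    llm = OpenAIModel(
        model_name="gpt-4o-mini",
        api_key=os.environ["LLM_API_KEY"],        # placeholder variable names
        base_url=os.environ.get("LLM_BASE_URL"),
    )
    print(await llm.generate_answer("Name one graph traversal algorithm."))
    tokens = await llm.generate_topk_per_token("Answer yes or no: is the sky blue?")
    print(tokens[0].top_candidates)               # candidates for the first output token

asyncio.run(main())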
graphgen/models/llm/tokenizer.py ADDED
@@ -0,0 +1,73 @@
from dataclasses import dataclass
from typing import List
import tiktoken

try:
    from transformers import AutoTokenizer
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    AutoTokenizer = None
    TRANSFORMERS_AVAILABLE = False


def get_tokenizer(tokenizer_name: str = "cl100k_base"):
    """
    Get a tokenizer instance by name.

    :param tokenizer_name: tokenizer name, a tiktoken encoding name or a Hugging Face model name
    :return: tokenizer instance
    """
    if tokenizer_name in tiktoken.list_encoding_names():
        return tiktoken.get_encoding(tokenizer_name)
    if TRANSFORMERS_AVAILABLE:
        try:
            return AutoTokenizer.from_pretrained(tokenizer_name)
        except Exception as e:
            raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}") from e
    raise ValueError("Hugging Face Transformers is not available, please install it first.")


@dataclass
class Tokenizer:
    model_name: str = "cl100k_base"

    def __post_init__(self):
        self.tokenizer = get_tokenizer(self.model_name)

    def encode_string(self, text: str) -> List[int]:
        """
        Encode text to tokens

        :param text: text to encode
        :return: tokens
        """
        return self.tokenizer.encode(text)

    def decode_tokens(self, tokens: List[int]) -> str:
        """
        Decode tokens to text

        :param tokens: tokens to decode
        :return: text
        """
        return self.tokenizer.decode(tokens)

    def chunk_by_token_size(
        self, content: str, overlap_token_size=128, max_token_size=1024
    ):
        tokens = self.encode_string(content)
        results = []
        for index, start in enumerate(
            range(0, len(tokens), max_token_size - overlap_token_size)
        ):
            chunk_content = self.decode_tokens(
                tokens[start : start + max_token_size]
            )
            results.append(
                {
                    "tokens": min(max_token_size, len(tokens) - start),
                    "content": chunk_content.strip(),
                    "chunk_order_index": index,
                }
            )
        return results
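The chunk windows advance by max_token_size - overlap_token_size tokens, so consecutive chunks share the overlap. A quick sketch (tiktoken must be installed):

from graphgen.models.llm.tokenizer import Tokenizer

tok = Tokenizer("cl100k_base")
chunks = tok.chunk_by_token_size("one two three four " * 20,
                                 overlap_token_size=2, max_token_size=8)
# Windows start every 8 - 2 = 6 tokens; the last chunk may be shorter.
for c in chunks[:3]:
    print(c["chunk_order_index"], c["tokens"], c["content"][:30])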
graphgen/models/llm/topk_token_model.py ADDED
@@ -0,0 +1,48 @@
import math
from dataclasses import dataclass, field
from typing import List, Union, Optional


@dataclass
class Token:
    text: str
    prob: float
    top_candidates: List = field(default_factory=list)
    ppl: Union[float, None] = field(default=None)

    @property
    def logprob(self) -> float:
        return math.log(self.prob)


@dataclass
class TopkTokenModel:
    do_sample: bool = False
    temperature: float = 0
    max_tokens: int = 4096
    repetition_penalty: float = 1.05
    num_beams: int = 1
    topk: int = 50
    topp: float = 0.95

    topk_per_token: int = 5  # number of topk tokens to generate for each token

    async def generate_topk_per_token(self, text: str) -> List[Token]:
        """
        Generate prob, text and candidates for each token of the model's output.
        This function is used to visualize the inference process.
        """
        raise NotImplementedError

    async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]:
        """
        Generate prob and text for each token of the input text.
        This function is used to visualize the ppl.
        """
        raise NotImplementedError

    async def generate_answer(self, text: str, history: Optional[List[str]] = None) -> str:
        """
        Generate answer from the model.
        """
        raise NotImplementedError
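Token stores a linear probability and derives the log-probability on demand, the inverse of the math.exp conversion used by the OpenAI client above:

import math
from graphgen.models.llm.topk_token_model import Token

t = Token(text="yes", prob=0.9)
assert abs(t.logprob - math.log(0.9)) < 1e-12  # prob and logprob round-trip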
graphgen/models/search/__init__.py ADDED
File without changes
graphgen/models/search/wiki_search.py ADDED
@@ -0,0 +1,36 @@
from typing import List, Union
from dataclasses import dataclass

import wikipedia
from wikipedia import set_lang
from graphgen.utils import detect_main_language, logger


@dataclass
class WikiSearch:
    @staticmethod
    def set_language(language: str):
        assert language in ["en", "zh"], "Only English and Chinese are supported"
        set_lang(language)

    async def search(self, query: str) -> Union[List[str], None]:
        self.set_language(detect_main_language(query))
        return wikipedia.search(query)

    async def summary(self, query: str) -> Union[str, None]:
        self.set_language(detect_main_language(query))
        try:
            result = wikipedia.summary(query, auto_suggest=False, redirect=False)
        except wikipedia.exceptions.DisambiguationError as e:
            logger.error("DisambiguationError: %s", e)
            result = None
        return result

    async def page(self, query: str) -> Union[str, None]:
        self.set_language(detect_main_language(query))
        try:
            result = wikipedia.page(query, auto_suggest=False, redirect=False).content
        except wikipedia.exceptions.DisambiguationError as e:
            logger.error("DisambiguationError: %s", e)
            result = None
        return result
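A minimal sketch of the search client; it needs network access and the wikipedia package, and summary returns None for disambiguation pages (other wikipedia exceptions, e.g. redirects, are not caught here):

import asyncio
from graphgen.models.search.wiki_search import WikiSearch

async def main():
    ws = WikiSearch()
    titles = await ws.search("Knowledge graph")
    print(titles[:3])
    summary = await ws.summary("Knowledge graph")
    print((summary or "")[:120])

asyncio.run(main())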
graphgen/models/storage/__init__.py ADDED
File without changes
graphgen/models/storage/base_storage.py ADDED
@@ -0,0 +1,94 @@
from dataclasses import dataclass
from typing import Union, Generic, TypeVar
from graphgen.models.embed.embedding import EmbeddingFunc

T = TypeVar("T")


@dataclass
class StorageNameSpace:
    working_dir: str = None
    namespace: str = None

    async def index_done_callback(self):
        """commit the storage operations after indexing"""

    async def query_done_callback(self):
        """commit the storage operations after querying"""


@dataclass
class BaseKVStorage(Generic[T], StorageNameSpace):
    embedding_func: EmbeddingFunc = None

    async def all_keys(self) -> list[str]:
        raise NotImplementedError

    async def get_by_id(self, id: str) -> Union[T, None]:
        raise NotImplementedError

    async def get_by_ids(
        self, ids: list[str], fields: Union[set[str], None] = None
    ) -> list[Union[T, None]]:
        raise NotImplementedError

    async def filter_keys(self, data: list[str]) -> set[str]:
        """return the keys that do not yet exist in the storage"""
        raise NotImplementedError

    async def upsert(self, data: dict[str, T]):
        raise NotImplementedError

    async def drop(self):
        raise NotImplementedError


@dataclass
class BaseGraphStorage(StorageNameSpace):
    embedding_func: EmbeddingFunc = None

    async def has_node(self, node_id: str) -> bool:
        raise NotImplementedError

    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
        raise NotImplementedError

    async def node_degree(self, node_id: str) -> int:
        raise NotImplementedError

    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
        raise NotImplementedError

    async def get_node(self, node_id: str) -> Union[dict, None]:
        raise NotImplementedError

    async def update_node(self, node_id: str, node_data: dict[str, str]):
        raise NotImplementedError

    async def get_all_nodes(self) -> Union[list[dict], None]:
        raise NotImplementedError

    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> Union[dict, None]:
        raise NotImplementedError

    async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]):
        raise NotImplementedError

    async def get_all_edges(self) -> Union[list[dict], None]:
        raise NotImplementedError

    async def get_node_edges(
        self, source_node_id: str
    ) -> Union[list[tuple[str, str]], None]:
        raise NotImplementedError

    async def upsert_node(self, node_id: str, node_data: dict[str, str]):
        raise NotImplementedError

    async def upsert_edge(
        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
    ):
        raise NotImplementedError

    async def delete_node(self, node_id: str):
        raise NotImplementedError
graphgen/models/storage/json_storage.py ADDED
@@ -0,0 +1,51 @@
import os

from dataclasses import dataclass
from graphgen.utils import logger, load_json, write_json
from graphgen.models.storage.base_storage import BaseKVStorage


@dataclass
class JsonKVStorage(BaseKVStorage):
    _data: dict[str, str] = None

    def __post_init__(self):
        self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
        self._data = load_json(self._file_name) or {}
        logger.info("Load KV %s with %d data", self.namespace, len(self._data))

    @property
    def data(self):
        return self._data

    async def all_keys(self) -> list[str]:
        return list(self._data.keys())

    async def index_done_callback(self):
        write_json(self._data, self._file_name)

    async def get_by_id(self, id):
        return self._data.get(id, None)

    async def get_by_ids(self, ids, fields=None) -> list:
        if fields is None:
            return [self._data.get(id, None) for id in ids]
        return [
            (
                {k: v for k, v in self._data[id].items() if k in fields}
                if self._data.get(id, None)
                else None
            )
            for id in ids
        ]

    async def filter_keys(self, data: list[str]) -> set[str]:
        return {s for s in data if s not in self._data}

    async def upsert(self, data: dict):
        left_data = {k: v for k, v in data.items() if k not in self._data}
        self._data.update(left_data)
        return left_data

    async def drop(self):
        self._data = {}
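Note that upsert only inserts keys that are absent and returns that subset, so existing values are never overwritten. A sketch, assuming the working directory already exists:

import asyncio
from graphgen.models.storage.json_storage import JsonKVStorage

async def main():
    kv = JsonKVStorage(working_dir="cache", namespace="demo")  # cache/demo.json
    print(await kv.upsert({"a": 1, "b": 2}))    # {'a': 1, 'b': 2}
    print(await kv.upsert({"b": 99, "c": 3}))   # {'c': 3}; 'b' keeps its old value
    print(await kv.filter_keys(["a", "z"]))     # {'z'}
    await kv.index_done_callback()              # persist to disk

asyncio.run(main())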
graphgen/models/storage/networkx_storage.py ADDED
@@ -0,0 +1,159 @@
import os
import html
from typing import Any, Union, cast, Optional
from dataclasses import dataclass
import networkx as nx

from graphgen.utils import logger
from .base_storage import BaseGraphStorage


@dataclass
class NetworkXStorage(BaseGraphStorage):
    @staticmethod
    def load_nx_graph(file_name) -> Optional[nx.Graph]:
        if os.path.exists(file_name):
            return nx.read_graphml(file_name)
        return None

    @staticmethod
    def write_nx_graph(graph: nx.Graph, file_name):
        logger.info("Writing graph with %d nodes, %d edges", graph.number_of_nodes(), graph.number_of_edges())
        nx.write_graphml(graph, file_name)

    @staticmethod
    def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
        """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
        Return the largest connected component of the graph, with nodes and edges sorted in a stable way.
        """
        from graspologic.utils import largest_connected_component

        graph = graph.copy()
        graph = cast(nx.Graph, largest_connected_component(graph))
        node_mapping = {
            node: html.unescape(node.upper().strip()) for node in graph.nodes()
        }  # type: ignore
        graph = nx.relabel_nodes(graph, node_mapping)
        return NetworkXStorage._stabilize_graph(graph)

    @staticmethod
    def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
        """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
        Ensure an undirected graph with the same relationships will always be read the same way.
        This is achieved by sorting the nodes and edges.
        """
        fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph()

        sorted_nodes = graph.nodes(data=True)
        sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0])

        fixed_graph.add_nodes_from(sorted_nodes)
        edges = list(graph.edges(data=True))

        if not graph.is_directed():

            def _sort_source_target(edge):
                source, target, edge_data = edge
                if source > target:
                    source, target = target, source
                return source, target, edge_data

            edges = [_sort_source_target(edge) for edge in edges]

        def _get_edge_key(source: Any, target: Any) -> str:
            return f"{source} -> {target}"

        edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1]))

        fixed_graph.add_edges_from(edges)
        return fixed_graph

    def __post_init__(self):
        """
        Load the graph file if it exists; otherwise create a new graph.
        """
        self._graphml_xml_file = os.path.join(
            self.working_dir, f"{self.namespace}.graphml"
        )
        preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
        if preloaded_graph is not None:
            logger.info(
                "Loaded graph from %s with %d nodes, %d edges", self._graphml_xml_file,
                preloaded_graph.number_of_nodes(), preloaded_graph.number_of_edges()
            )
        self._graph = preloaded_graph or nx.Graph()

    async def index_done_callback(self):
        NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file)

    async def has_node(self, node_id: str) -> bool:
        return self._graph.has_node(node_id)

    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
        return self._graph.has_edge(source_node_id, target_node_id)

    async def get_node(self, node_id: str) -> Union[dict, None]:
        return self._graph.nodes.get(node_id)

    async def get_all_nodes(self) -> Union[list[dict], None]:
        return self._graph.nodes(data=True)

    async def node_degree(self, node_id: str) -> int:
        return self._graph.degree(node_id)

    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
        return self._graph.degree(src_id) + self._graph.degree(tgt_id)

    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> Union[dict, None]:
        return self._graph.edges.get((source_node_id, target_node_id))

    async def get_all_edges(self) -> Union[list[dict], None]:
        return self._graph.edges(data=True)

    async def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]:
        if self._graph.has_node(source_node_id):
            return list(self._graph.edges(source_node_id, data=True))
        return None

    async def get_graph(self) -> nx.Graph:
        return self._graph

    async def upsert_node(self, node_id: str, node_data: dict[str, str]):
        self._graph.add_node(node_id, **node_data)

    async def update_node(self, node_id: str, node_data: dict[str, str]):
        if self._graph.has_node(node_id):
            self._graph.nodes[node_id].update(node_data)
        else:
            logger.warning("Node %s not found in the graph for update.", node_id)

    async def upsert_edge(
        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
    ):
        self._graph.add_edge(source_node_id, target_node_id, **edge_data)

    async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]):
        if self._graph.has_edge(source_node_id, target_node_id):
            self._graph.edges[(source_node_id, target_node_id)].update(edge_data)
        else:
            logger.warning("Edge %s -> %s not found in the graph for update.", source_node_id, target_node_id)

    async def delete_node(self, node_id: str):
        """
        Delete a node from the graph based on the specified node_id.

        :param node_id: The node_id to delete
        """
        if self._graph.has_node(node_id):
            self._graph.remove_node(node_id)
            logger.info("Node %s deleted from the graph.", node_id)
        else:
            logger.warning("Node %s not found in the graph for deletion.", node_id)

    async def clear(self):
        """
        Clear the graph by removing all nodes and edges.
        """
        self._graph.clear()
        logger.info("Graph %s cleared.", self.namespace)
graphgen/models/strategy/__init__.py ADDED
File without changes
graphgen/models/strategy/base_strategy.py ADDED
@@ -0,0 +1,5 @@
from dataclasses import dataclass


@dataclass
class BaseStrategy:
    pass
graphgen/models/strategy/travserse_strategy.py ADDED
@@ -0,0 +1,30 @@
from dataclasses import dataclass, fields

from graphgen.models.strategy.base_strategy import BaseStrategy


@dataclass
class TraverseStrategy(BaseStrategy):
    # Form of the generated QA pairs: atomic, multi-hop, or aggregated
    qa_form: str = "atomic"  # "atomic" or "multi_hop" or "aggregated"
    # Expansion budget: only one of max edge count / max token count takes effect
    expand_method: str = "max_tokens"  # "max_width" or "max_tokens"
    # Expand in one direction or in both directions
    bidirectional: bool = True
    # Maximum number of extra edges per direction
    max_extra_edges: int = 5
    # Maximum number of tokens
    max_tokens: int = 256
    # Maximum expansion depth per direction
    max_depth: int = 2
    # Edge-selection strategy within a level (for bidirectional expansion,
    # a level is the set of edges connected on either side)
    edge_sampling: str = "max_loss"  # "max_loss" or "min_loss" or "random"
    # How to handle isolated nodes
    isolated_node_strategy: str = "add"  # "add" or "ignore"
    loss_strategy: str = "only_edge"  # only_edge, both

    def to_yaml(self):
        strategy_dict = {}
        for f in fields(self):
            strategy_dict[f.name] = getattr(self, f.name)
        return {"traverse_strategy": strategy_dict}
graphgen/models/text/__init__.py ADDED
File without changes
graphgen/models/text/chunk.py ADDED
@@ -0,0 +1,7 @@
from dataclasses import dataclass


@dataclass
class Chunk:
    id: str
    content: str
graphgen/models/text/text_pair.py ADDED
@@ -0,0 +1,9 @@
from dataclasses import dataclass


@dataclass
class TextPair:
    """
    A pair of input data.
    """
    question: str
    answer: str
graphgen/operators/__init__.py ADDED
@@ -0,0 +1,16 @@
from .extract_kg import extract_kg
from .quiz import quiz
from .judge import judge_statement, skip_judge_statement
from .search_wikipedia import search_wikipedia
from .traverse_graph import traverse_graph_by_edge, traverse_graph_atomically, traverse_graph_for_multi_hop

__all__ = [
    "extract_kg",
    "quiz",
    "judge_statement",
    "skip_judge_statement",
    "search_wikipedia",
    "traverse_graph_by_edge",
    "traverse_graph_atomically",
    "traverse_graph_for_multi_hop"
]
graphgen/operators/extract_kg.py ADDED
@@ -0,0 +1,132 @@
import re
import asyncio
from typing import List
from collections import defaultdict

import gradio as gr
from tqdm.asyncio import tqdm as tqdm_async
from graphgen.models import Chunk, OpenAIModel, Tokenizer
from graphgen.models.storage.base_storage import BaseGraphStorage
from graphgen.templates import KG_EXTRACTION_PROMPT
from graphgen.utils import (logger, pack_history_conversations, split_string_by_multi_markers,
                            handle_single_entity_extraction, handle_single_relationship_extraction,
                            detect_if_chinese)
from graphgen.operators.merge_kg import merge_nodes, merge_edges


# pylint: disable=too-many-statements
async def extract_kg(
        llm_client: OpenAIModel,
        kg_instance: BaseGraphStorage,
        tokenizer_instance: Tokenizer,
        chunks: List[Chunk],
        progress_bar: gr.Progress = None,
        max_concurrent: int = 1000
):
    """
    :param llm_client: Synthesizer LLM model to extract entities and relationships
    :param kg_instance: graph storage instance
    :param tokenizer_instance: tokenizer instance
    :param chunks: text chunks to extract from
    :param progress_bar: Gradio progress bar to show the progress of the extraction
    :param max_concurrent: max concurrent tasks
    :return:
    """

    semaphore = asyncio.Semaphore(max_concurrent)

    async def _process_single_content(chunk: Chunk, max_loop: int = 3):
        async with semaphore:
            chunk_id = chunk.id
            content = chunk.content
            if detect_if_chinese(content):
                language = "Chinese"
            else:
                language = "English"
            KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language

            hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format(
                **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content
            )

            final_result = await llm_client.generate_answer(hint_prompt)
            logger.info('First result: %s', final_result)

            history = pack_history_conversations(hint_prompt, final_result)
            for loop_index in range(max_loop):
                if_loop_result = await llm_client.generate_answer(
                    text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"],
                    history=history
                )
                if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
                if if_loop_result != "yes":
                    break

                glean_result = await llm_client.generate_answer(
                    text=KG_EXTRACTION_PROMPT[language]["CONTINUE"],
                    history=history
                )
                logger.info('Loop %s glean: %s', loop_index, glean_result)

                history += pack_history_conversations(KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result)
                final_result += glean_result
                if loop_index == max_loop - 1:
                    break

            records = split_string_by_multi_markers(
                final_result,
                [
                    KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
                    KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"]],
            )

            nodes = defaultdict(list)
            edges = defaultdict(list)

            for record in records:
                record = re.search(r"\((.*)\)", record)
                if record is None:
                    continue
                record = record.group(1)  # extract the content inside the parentheses
                record_attributes = split_string_by_multi_markers(
                    record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
                )

                entity = await handle_single_entity_extraction(record_attributes, chunk_id)
                if entity is not None:
                    nodes[entity["entity_name"]].append(entity)
                    continue
                relation = await handle_single_relationship_extraction(record_attributes, chunk_id)
                if relation is not None:
                    edges[(relation["src_id"], relation["tgt_id"])].append(relation)
            return dict(nodes), dict(edges)

    results = []
    chunk_number = len(chunks)
    async for result in tqdm_async(
        asyncio.as_completed([_process_single_content(c) for c in chunks]),
        total=len(chunks),
        desc="[3/4]Extracting entities and relationships from chunks",
        unit="chunk",
    ):
        try:
            if progress_bar is not None:
                progress_bar(len(results) / chunk_number, desc="[3/4]Extracting entities and relationships from chunks")
            results.append(await result)
            if progress_bar is not None and len(results) == chunk_number:
                progress_bar(1, desc="[3/4]Extracting entities and relationships from chunks")
        except Exception as e:  # pylint: disable=broad-except
            logger.error("Error occurred while extracting entities and relationships from chunks: %s", e)

    nodes = defaultdict(list)
    edges = defaultdict(list)
    for n, e in results:
        for k, v in n.items():
            nodes[k].extend(v)
        for k, v in e.items():
            edges[tuple(sorted(k))].extend(v)

    await merge_nodes(nodes, kg_instance, llm_client, tokenizer_instance)
    await merge_edges(edges, kg_instance, llm_client, tokenizer_instance)

    return kg_instance
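The record-parsing step above pulls the parenthesized tuple out of each record and splits it on the tuple delimiter. A sketch with hypothetical delimiter values (the real ones live in KG_EXTRACTION_PROMPT["FORMAT"]):

import re

record = '("entity"<|>"CPU"<|>"HARDWARE"<|>"central processing unit")##'
match = re.search(r"\((.*)\)", record)     # content inside the parentheses
attributes = match.group(1).split("<|>")   # "<|>" stands in for the tuple_delimiter
print(attributes)
# ['"entity"', '"CPU"', '"HARDWARE"', '"central processing unit"']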
graphgen/operators/judge.py ADDED
@@ -0,0 +1,188 @@
import math
import asyncio
from tqdm.asyncio import tqdm as tqdm_async
from graphgen.models import NetworkXStorage, OpenAIModel, JsonKVStorage
from graphgen.utils import logger, yes_no_loss_entropy
from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT


async def judge_statement(  # pylint: disable=too-many-statements
        trainee_llm_client: OpenAIModel,
        graph_storage: NetworkXStorage,
        rephrase_storage: JsonKVStorage,
        re_judge: bool = False,
        max_concurrent: int = 1000) -> NetworkXStorage:
    """
    Get all edges and nodes and judge them

    :param trainee_llm_client: judge the statements to get comprehension loss
    :param graph_storage: graph storage instance
    :param rephrase_storage: rephrase storage instance
    :param re_judge: re-judge the relations
    :param max_concurrent: max concurrent
    :return:
    """

    semaphore = asyncio.Semaphore(max_concurrent)

    async def _judge_single_relation(
        edge: tuple,
    ):
        async with semaphore:
            source_id = edge[0]
            target_id = edge[1]
            edge_data = edge[2]

            if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
                logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"])
                return source_id, target_id, edge_data

            description = edge_data["description"]

            try:
                descriptions = await rephrase_storage.get_by_id(description)
                assert descriptions is not None

                judgements = []
                gts = [gt for _, gt in descriptions]
                for description, gt in descriptions:
                    judgement = await trainee_llm_client.generate_topk_per_token(
                        STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description)
                    )
                    judgements.append(judgement[0].top_candidates)

                loss = yes_no_loss_entropy(judgements, gts)

                logger.info("Edge %s -> %s description: %s loss: %s", source_id, target_id, description, loss)

                edge_data["loss"] = loss
            except Exception as e:  # pylint: disable=broad-except
                logger.error("Error in judging relation %s -> %s: %s", source_id, target_id, e)
                logger.info("Use default loss 0.1")
                edge_data["loss"] = -math.log(0.1)

            await graph_storage.update_edge(source_id, target_id, edge_data)
            return source_id, target_id, edge_data

    edges = await graph_storage.get_all_edges()

    results = []
    for result in tqdm_async(
        asyncio.as_completed([_judge_single_relation(edge) for edge in edges]),
        total=len(edges),
        desc="Judging relations"
    ):
        results.append(await result)

    async def _judge_single_entity(
        node: tuple,
    ):
        async with semaphore:
            node_id = node[0]
            node_data = node[1]

            if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
                logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"])
                return node_id, node_data

            description = node_data["description"]

            try:
                descriptions = await rephrase_storage.get_by_id(description)
                assert descriptions is not None

                judgements = []
                gts = [gt for _, gt in descriptions]
                for description, gt in descriptions:
                    judgement = await trainee_llm_client.generate_topk_per_token(
                        STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description)
                    )
                    judgements.append(judgement[0].top_candidates)

                loss = yes_no_loss_entropy(judgements, gts)

                logger.info("Node %s description: %s loss: %s", node_id, description, loss)

                node_data["loss"] = loss
            except Exception as e:  # pylint: disable=broad-except
                logger.error("Error in judging entity %s: %s", node_id, e)
                logger.info("Use default loss 0.1")
                node_data["loss"] = -math.log(0.1)

            await graph_storage.update_node(node_id, node_data)
            return node_id, node_data

    nodes = await graph_storage.get_all_nodes()

    results = []
    for result in tqdm_async(
        asyncio.as_completed([_judge_single_entity(node) for node in nodes]),
        total=len(nodes),
        desc="Judging entities"
    ):
        results.append(await result)

    return graph_storage


async def skip_judge_statement(
        graph_storage: NetworkXStorage,
        max_concurrent: int = 1000
):
    """
    Skip the judgement of the statement
    :param graph_storage: graph storage instance
    :param max_concurrent: max concurrent
    :return:
    """
    semaphore = asyncio.Semaphore(max_concurrent)

    async def _skip_single_relation(
        edge: tuple,
    ):
        async with semaphore:
            source_id = edge[0]
            target_id = edge[1]
            edge_data = edge[2]

            if "loss" in edge_data and edge_data["loss"] is not None:
                logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"])
                return source_id, target_id, edge_data

            edge_data["loss"] = -math.log(0.1)
            await graph_storage.update_edge(source_id, target_id, edge_data)
            return source_id, target_id, edge_data

    edges = await graph_storage.get_all_edges()
    results = []
    for result in tqdm_async(
        asyncio.as_completed([_skip_single_relation(edge) for edge in edges]),
        total=len(edges),
        desc="Skipping judgement of relations"
    ):
        results.append(await result)

    async def _skip_single_entity(
        node: tuple,
    ):
        async with semaphore:
            node_id = node[0]
            node_data = node[1]

            if "loss" in node_data and node_data["loss"] is not None:
                logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"])
                return node_id, node_data

            node_data["loss"] = -math.log(0.1)
            await graph_storage.update_node(node_id, node_data)
            return node_id, node_data

    nodes = await graph_storage.get_all_nodes()
    results = []
    for result in tqdm_async(
        asyncio.as_completed([_skip_single_entity(node) for node in nodes]),
        total=len(nodes),
        desc="Skipping judgement of entities"
    ):
        results.append(await result)

    return graph_storage
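yes_no_loss_entropy is imported from graphgen.utils and not shown in this commit view; one plausible reading of the idea, hedged as a hypothetical sketch, is the average negative log-probability the trainee assigns to the correct yes/no answer:

# Hypothetical sketch of the comprehension-loss idea; the real
# yes_no_loss_entropy in graphgen.utils may differ in detail.
import math

def yes_no_loss(p_yes_per_statement, ground_truths):
    losses = []
    for p_yes, gt in zip(p_yes_per_statement, ground_truths):
        p_correct = p_yes if gt == "yes" else 1.0 - p_yes
        losses.append(-math.log(max(p_correct, 1e-9)))
    return sum(losses) / len(losses)

print(yes_no_loss([0.9, 0.2], ["yes", "no"]))  # both judged correctly -> low loss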
graphgen/operators/merge_kg.py ADDED
@@ -0,0 +1,215 @@
from collections import Counter
import asyncio
from tqdm.asyncio import tqdm as tqdm_async

from graphgen.utils.format import split_string_by_multi_markers
from graphgen.utils import logger, detect_main_language
from graphgen.models import TopkTokenModel, Tokenizer
from graphgen.models.storage.base_storage import BaseGraphStorage
from graphgen.templates import KG_SUMMARIZATION_PROMPT, KG_EXTRACTION_PROMPT


async def _handle_kg_summary(
        entity_or_relation_name: str,
        description: str,
        llm_client: TopkTokenModel,
        tokenizer_instance: Tokenizer,
        max_summary_tokens: int = 200
) -> str:
    """
    Summarize the description of an entity or relation

    :param entity_or_relation_name: name of the entity or relation
    :param description: description to summarize
    :param llm_client: LLM client used for summarization
    :param tokenizer_instance: tokenizer instance
    :param max_summary_tokens: token budget before summarization kicks in
    :return: new description
    """
    language = detect_main_language(description)
    if language == "en":
        language = "English"
    else:
        language = "Chinese"
    KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language

    tokens = tokenizer_instance.encode_string(description)
    if len(tokens) < max_summary_tokens:
        return description

    use_description = tokenizer_instance.decode_tokens(tokens[:max_summary_tokens])
    prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format(
        entity_name=entity_or_relation_name,
        description_list=use_description.split('<SEP>'),
        **KG_SUMMARIZATION_PROMPT["FORMAT"]
    )
    new_description = await llm_client.generate_answer(prompt)
    logger.info("Entity or relation %s summary: %s", entity_or_relation_name, new_description)
    return new_description


async def merge_nodes(
        nodes_data: dict,
        kg_instance: BaseGraphStorage,
        llm_client: TopkTokenModel,
        tokenizer_instance: Tokenizer,
        max_concurrent: int = 1000
):
    """
    Merge nodes

    :param nodes_data: nodes to merge
    :param kg_instance: graph storage instance
    :param llm_client: LLM client
    :param tokenizer_instance: tokenizer instance
    :param max_concurrent: max concurrent tasks
    :return
    """

    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_single_node(entity_name: str, node_data: list[dict]):
        async with semaphore:
            entity_types = []
            source_ids = []
            descriptions = []

            node = await kg_instance.get_node(entity_name)
            if node is not None:
                entity_types.append(node["entity_type"])
                source_ids.extend(
                    split_string_by_multi_markers(node["source_id"], ['<SEP>'])
                )
                descriptions.append(node["description"])

            # Count entity_type occurrences across the new and existing node data,
            # then keep the most frequent entity_type
            entity_type = sorted(
                Counter(
                    [dp["entity_type"] for dp in node_data] + entity_types
                ).items(),
                key=lambda x: x[1],
                reverse=True,
            )[0][0]

            description = '<SEP>'.join(
                sorted(set([dp["description"] for dp in node_data] + descriptions))
            )
            description = await _handle_kg_summary(
                entity_name, description, llm_client, tokenizer_instance
            )

            source_id = '<SEP>'.join(
                set([dp["source_id"] for dp in node_data] + source_ids)
            )

            node_data = {
                "entity_type": entity_type,
                "description": description,
                "source_id": source_id
            }
            await kg_instance.upsert_node(
                entity_name,
                node_data=node_data
            )
            node_data["entity_name"] = entity_name
            return node_data

    logger.info("Inserting entities into storage...")
    entities_data = []
    for result in tqdm_async(
        asyncio.as_completed(
            [process_single_node(k, v) for k, v in nodes_data.items()]
        ),
        total=len(nodes_data),
        desc="Inserting entities into storage",
        unit="entity",
    ):
        try:
            entities_data.append(await result)
        except Exception as e:  # pylint: disable=broad-except
            logger.error("Error occurred while inserting entities into storage: %s", e)


async def merge_edges(
        edges_data: dict,
        kg_instance: BaseGraphStorage,
        llm_client: TopkTokenModel,
        tokenizer_instance: Tokenizer,
        max_concurrent: int = 1000
):
    """
    Merge edges

    :param edges_data: edges to merge
    :param kg_instance: graph storage instance
    :param llm_client: LLM client
    :param tokenizer_instance: tokenizer instance
    :param max_concurrent: max concurrent tasks
    :return
    """

    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]):
        async with semaphore:
            source_ids = []
            descriptions = []

            edge = await kg_instance.get_edge(src_id, tgt_id)
            if edge is not None:
                source_ids.extend(
                    split_string_by_multi_markers(edge["source_id"], ['<SEP>'])
                )
                descriptions.append(edge["description"])

            description = '<SEP>'.join(
                sorted(set([dp["description"] for dp in edge_data] + descriptions))
            )
            source_id = '<SEP>'.join(
                set([dp["source_id"] for dp in edge_data] + source_ids)
            )

            for insert_id in [src_id, tgt_id]:
                if not await kg_instance.has_node(insert_id):
                    await kg_instance.upsert_node(
                        insert_id,
                        node_data={
                            "source_id": source_id,
                            "description": description,
                            "entity_type": "UNKNOWN"
                        }
                    )

            description = await _handle_kg_summary(
                f"({src_id}, {tgt_id})", description, llm_client, tokenizer_instance
            )

            await kg_instance.upsert_edge(
                src_id,
                tgt_id,
                edge_data={
                    "source_id": source_id,
                    "description": description
                }
            )

            edge_data = {
                "src_id": src_id,
                "tgt_id": tgt_id,
                "description": description
            }
            return edge_data

    logger.info("Inserting relationships into storage...")
    relationships_data = []
    for result in tqdm_async(
        asyncio.as_completed(
            [process_single_edge(src_id, tgt_id, v) for (src_id, tgt_id), v in edges_data.items()]
        ),
        total=len(edges_data),
        desc="Inserting relationships into storage",
        unit="relationship",
    ):
        try:
            relationships_data.append(await result)
        except Exception as e:  # pylint: disable=broad-except
            logger.error("Error occurred while inserting relationships into storage: %s", e)
graphgen/operators/quiz.py ADDED
@@ -0,0 +1,109 @@
import asyncio
from collections import defaultdict

from tqdm.asyncio import tqdm as tqdm_async
from graphgen.models import JsonKVStorage, OpenAIModel, NetworkXStorage
from graphgen.utils import logger, detect_main_language
from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT


async def quiz(
        synth_llm_client: OpenAIModel,
        graph_storage: NetworkXStorage,
        rephrase_storage: JsonKVStorage,
        max_samples: int = 1,
        max_concurrent: int = 1000) -> JsonKVStorage:
    """
    Get all edges and quiz them

    :param synth_llm_client: generate statements
    :param graph_storage: graph storage instance
    :param rephrase_storage: rephrase storage instance
    :param max_samples: max samples for each edge
    :param max_concurrent: max concurrent
    :return:
    """

    semaphore = asyncio.Semaphore(max_concurrent)

    async def _process_single_quiz(
        des: str,
        prompt: str,
        gt: str
    ):
        async with semaphore:
            try:
                # If the description is already in rephrase_storage, skip it
                descriptions = await rephrase_storage.get_by_id(des)
                if descriptions:
                    return None

                new_description = await synth_llm_client.generate_answer(
                    prompt,
                    temperature=1
                )
                return {des: [(new_description, gt)]}

            except Exception as e:  # pylint: disable=broad-except
                logger.error("Error when quizzing description %s: %s", des, e)
                return None

    edges = await graph_storage.get_all_edges()
    nodes = await graph_storage.get_all_nodes()

    results = defaultdict(list)
    tasks = []
    for edge in edges:
        edge_data = edge[2]

        description = edge_data["description"]
        language = "English" if detect_main_language(description) == "en" else "Chinese"

        results[description] = [(description, 'yes')]

        for i in range(max_samples):
            if i > 0:
                tasks.append(
                    _process_single_quiz(description,
                                         DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(
                                             input_sentence=description), 'yes')
                )
            tasks.append(_process_single_quiz(description,
                                              DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format(
                                                  input_sentence=description), 'no'))

    for node in nodes:
        node_data = node[1]
        description = node_data["description"]
        language = "English" if detect_main_language(description) == "en" else "Chinese"

        results[description] = [(description, 'yes')]

        for i in range(max_samples):
            if i > 0:
                tasks.append(
                    _process_single_quiz(description,
                                         DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(
                                             input_sentence=description), 'yes')
                )
            tasks.append(_process_single_quiz(description,
                                              DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format(
                                                  input_sentence=description), 'no'))

    for result in tqdm_async(
        asyncio.as_completed(tasks),
        total=len(tasks),
        desc="Quizzing descriptions"
    ):
        new_result = await result
        if new_result:
            for key, value in new_result.items():
                results[key].extend(value)

    for key, value in results.items():
        results[key] = list(set(value))
        await rephrase_storage.upsert({key: results[key]})

    return rephrase_storage
graphgen/operators/resolute_coreference.py ADDED
@@ -0,0 +1,33 @@
from typing import List
from graphgen.models import Chunk
from graphgen.models import OpenAIModel
from graphgen.templates import COREFERENCE_RESOLUTION_TEMPLATE
from graphgen.utils import detect_main_language


async def resolute_coreference(
        llm_client: OpenAIModel,
        chunks: List[Chunk]) -> List[Chunk]:
    """
    Resolve coreferences across chunks, using the first chunk as the reference

    :param llm_client: LLM model
    :param chunks: List of chunks
    :return: List of chunks
    """

    if len(chunks) == 0:
        return chunks

    results = [chunks[0]]

    for _, chunk in enumerate(chunks[1:]):
        language = detect_main_language(chunk.content)
        result = await llm_client.generate_answer(
            COREFERENCE_RESOLUTION_TEMPLATE[language].format(
                reference=results[0].content,
                input_sentence=chunk.content
            )
        )
        results.append(Chunk(id=chunk.id, content=result))

    return results
graphgen/operators/search_wikipedia.py ADDED
@@ -0,0 +1,71 @@
import asyncio
from graphgen.models import WikiSearch, OpenAIModel
from graphgen.models.storage.base_storage import BaseGraphStorage
from graphgen.templates import SEARCH_JUDGEMENT_PROMPT
from graphgen.utils import logger


async def _process_single_entity(entity_name: str,
                                 description: str,
                                 llm_client: OpenAIModel,
                                 wiki_search_client: WikiSearch) -> tuple[str, None] | tuple[str, str]:
    """
    Process a single entity
    """
    search_results = await wiki_search_client.search(entity_name)
    if not search_results:
        return entity_name, None
    examples = "\n".join(SEARCH_JUDGEMENT_PROMPT["EXAMPLES"])
    search_results.append("None of the above")

    search_results_str = "\n".join([f"{i + 1}. {sr}" for i, sr in enumerate(search_results)])
    prompt = SEARCH_JUDGEMENT_PROMPT["TEMPLATE"].format(
        examples=examples,
        entity_name=entity_name,
        description=description,
        search_results=search_results_str,
    )
    response = await llm_client.generate_answer(prompt)

    try:
        response = response.strip()
        response = int(response)
        if response < 1 or response >= len(search_results):
            response = None
        else:
            response = await wiki_search_client.summary(search_results[response - 1])
    except ValueError:
        response = None

    logger.info("Entity %s search result: %s response: %s", entity_name, str(search_results), response)

    return entity_name, response


async def search_wikipedia(llm_client: OpenAIModel,
                           wiki_search_client: WikiSearch,
                           knowledge_graph_instance: BaseGraphStorage) -> dict:
    """
    Search wikipedia for entities

    :param llm_client: LLM model
    :param wiki_search_client: wiki search client
    :param knowledge_graph_instance: knowledge graph instance
    :return: nodes with search results
    """

    nodes = await knowledge_graph_instance.get_all_nodes()
    nodes = list(nodes)
    wiki_data = {}

    tasks = [
        _process_single_entity(node[0].strip('"'), node[1]["description"], llm_client, wiki_search_client)
        for node in nodes
    ]

    for task in asyncio.as_completed(tasks):
        result = await task
        wiki_data[result[0]] = result[1]

    return wiki_data
graphgen/operators/split_graph.py ADDED
@@ -0,0 +1,333 @@
+ import random
+ from collections import defaultdict
+
+ from tqdm.asyncio import tqdm as tqdm_async
+
+ from graphgen.models import NetworkXStorage, TraverseStrategy
+ from graphgen.utils import logger
+
+
+ async def _get_node_info(
+     node_id: str,
+     graph_storage: NetworkXStorage,
+ ) -> dict:
+     """
+     Get node info.
+
+     :param node_id: node id
+     :param graph_storage: graph storage instance
+     :return: node info
+     """
+     node_data = await graph_storage.get_node(node_id)
+     return {
+         "node_id": node_id,
+         **node_data
+     }
+
+
+ def _get_level_n_edges_by_max_width(
+     edge_adj_list: dict,
+     node_dict: dict,
+     edges: list,
+     nodes: list,
+     src_edge: tuple,
+     max_depth: int,
+     bidirectional: bool,
+     max_extra_edges: int,
+     edge_sampling: str,
+     loss_strategy: str = "only_edge"
+ ) -> list:
+     """
+     Get level-n edges for an edge, where n is bounded by max_depth in traverse_strategy.
+
+     :param edge_adj_list: mapping from node id to the indices of its incident edges
+     :param node_dict: mapping from node name to its index in nodes
+     :param edges: all edges
+     :param nodes: all nodes
+     :param src_edge: the edge to expand from
+     :param max_depth: maximum expansion depth
+     :param bidirectional: whether to expand from both endpoints
+     :param max_extra_edges: maximum number of extra edges to collect
+     :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
+     :param loss_strategy: loss strategy (only_edge, both)
+     :return: level-n edges
+     """
+     src_id, tgt_id, _ = src_edge
+
+     level_n_edges = []
+
+     start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id}
+
+     while max_depth > 0 and max_extra_edges > 0:
+         max_depth -= 1
+
+         candidate_edges = [
+             edges[edge_id]
+             for node in start_nodes
+             for edge_id in edge_adj_list[node]
+             if not edges[edge_id][2].get("visited", False)
+         ]
+
+         if not candidate_edges:
+             break
+
+         if len(candidate_edges) >= max_extra_edges:
+             if loss_strategy == "both":
+                 er_tuples = [
+                     ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
+                     for edge in candidate_edges
+                 ]
+                 candidate_edges = _sort_tuples(er_tuples, edge_sampling)[:max_extra_edges]
+             elif loss_strategy == "only_edge":
+                 candidate_edges = _sort_edges(candidate_edges, edge_sampling)[:max_extra_edges]
+             else:
+                 raise ValueError(f"Invalid loss strategy: {loss_strategy}")
+
+             for edge in candidate_edges:
+                 level_n_edges.append(edge)
+                 edge[2]["visited"] = True
+             break
+
+         max_extra_edges -= len(candidate_edges)
+         new_start_nodes = set()
+
+         for edge in candidate_edges:
+             level_n_edges.append(edge)
+             edge[2]["visited"] = True
+
+             if edge[0] not in start_nodes:
+                 new_start_nodes.add(edge[0])
+             if edge[1] not in start_nodes:
+                 new_start_nodes.add(edge[1])
+
+         start_nodes = new_start_nodes
+
+     return level_n_edges
+
+
+ def _get_level_n_edges_by_max_tokens(
+     edge_adj_list: dict,
+     node_dict: dict,
+     edges: list,
+     nodes: list,
+     src_edge: tuple,
+     max_depth: int,
+     bidirectional: bool,
+     max_tokens: int,
+     edge_sampling: str,
+     loss_strategy: str = "only_edge"
+ ) -> list:
+     """
+     Get level-n edges for an edge, where n is bounded by max_depth in traverse_strategy.
+
+     :param edge_adj_list: mapping from node id to the indices of its incident edges
+     :param node_dict: mapping from node name to its index in nodes
+     :param edges: all edges
+     :param nodes: all nodes
+     :param src_edge: the edge to expand from
+     :param max_depth: maximum expansion depth
+     :param bidirectional: whether to expand from both endpoints
+     :param max_tokens: token budget for the batch
+     :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
+     :param loss_strategy: loss strategy (only_edge, both)
+     :return: level-n edges
+     """
+     src_id, tgt_id, src_edge_data = src_edge
+
+     max_tokens -= (src_edge_data["length"] + nodes[node_dict[src_id]][1]["length"]
+                    + nodes[node_dict[tgt_id]][1]["length"])
+
+     level_n_edges = []
+
+     start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id}
+     temp_nodes = {src_id, tgt_id}
+
+     while max_depth > 0 and max_tokens > 0:
+         max_depth -= 1
+
+         candidate_edges = [
+             edges[edge_id]
+             for node in start_nodes
+             for edge_id in edge_adj_list[node]
+             if not edges[edge_id][2].get("visited", False)
+         ]
+
+         if not candidate_edges:
+             break
+
+         if loss_strategy == "both":
+             er_tuples = [
+                 ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
+                 for edge in candidate_edges
+             ]
+             candidate_edges = _sort_tuples(er_tuples, edge_sampling)
+         elif loss_strategy == "only_edge":
+             candidate_edges = _sort_edges(candidate_edges, edge_sampling)
+         else:
+             raise ValueError(f"Invalid loss strategy: {loss_strategy}")
+
+         for edge in candidate_edges:
+             max_tokens -= edge[2]["length"]
+             if edge[0] not in temp_nodes:
+                 max_tokens -= nodes[node_dict[edge[0]]][1]["length"]
+             if edge[1] not in temp_nodes:
+                 max_tokens -= nodes[node_dict[edge[1]]][1]["length"]
+
+             if max_tokens < 0:
+                 return level_n_edges
+
+             level_n_edges.append(edge)
+             edge[2]["visited"] = True
+             temp_nodes.add(edge[0])
+             temp_nodes.add(edge[1])
+
+         new_start_nodes = set()
+         for edge in candidate_edges:
+             if edge[0] not in start_nodes:
+                 new_start_nodes.add(edge[0])
+             if edge[1] not in start_nodes:
+                 new_start_nodes.add(edge[1])
+
+         start_nodes = new_start_nodes
+
+     return level_n_edges
+
+
+ def _sort_tuples(er_tuples: list, edge_sampling: str) -> list:
+     """
+     Sort edges with the given edge sampling strategy.
+
+     :param er_tuples: list of (nodes: list, edge: tuple) pairs
+     :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
+     :return: sorted edges
+     """
+     if edge_sampling == "random":
+         er_tuples = random.sample(er_tuples, len(er_tuples))
+     elif edge_sampling == "min_loss":
+         er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"])
+     elif edge_sampling == "max_loss":
+         er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
+                            reverse=True)
+     else:
+         raise ValueError(f"Invalid edge sampling: {edge_sampling}")
+     edges = [edge for _, edge in er_tuples]
+     return edges
+
+
+ def _sort_edges(edges: list, edge_sampling: str) -> list:
+     """
+     Sort edges with the given edge sampling strategy.
+
+     :param edges: total edges
+     :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
+     :return: sorted edges
+     """
+     if edge_sampling == "random":
+         random.shuffle(edges)
+     elif edge_sampling == "min_loss":
+         edges = sorted(edges, key=lambda x: x[2]["loss"])
+     elif edge_sampling == "max_loss":
+         edges = sorted(edges, key=lambda x: x[2]["loss"], reverse=True)
+     else:
+         raise ValueError(f"Invalid edge sampling: {edge_sampling}")
+     return edges
+
+
+ async def get_batches_with_strategy(  # pylint: disable=too-many-branches
+     nodes: list,
+     edges: list,
+     graph_storage: NetworkXStorage,
+     traverse_strategy: TraverseStrategy
+ ):
+     expand_method = traverse_strategy.expand_method
+     if expand_method == "max_width":
+         logger.info("Using max width strategy")
+     elif expand_method == "max_tokens":
+         logger.info("Using max tokens strategy")
+     else:
+         raise ValueError(f"Invalid expand method: {expand_method}")
+
+     max_depth = traverse_strategy.max_depth
+     edge_sampling = traverse_strategy.edge_sampling
+
+     # Build the edge adjacency list
+     edge_adj_list = defaultdict(list)
+     node_dict = {}
+     processing_batches = []
+
+     node_cache = {}
+
+     async def get_cached_node_info(node_id: str) -> dict:
+         if node_id not in node_cache:
+             node_cache[node_id] = await _get_node_info(node_id, graph_storage)
+         return node_cache[node_id]
+
+     for i, (node_name, _) in enumerate(nodes):
+         node_dict[node_name] = i
+
+     if traverse_strategy.loss_strategy == "both":
+         er_tuples = [
+             ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
+             for edge in edges
+         ]
+         edges = _sort_tuples(er_tuples, edge_sampling)
+     elif traverse_strategy.loss_strategy == "only_edge":
+         edges = _sort_edges(edges, edge_sampling)
+     else:
+         raise ValueError(f"Invalid loss strategy: {traverse_strategy.loss_strategy}")
+
+     for i, (src, tgt, _) in enumerate(edges):
+         edge_adj_list[src].append(i)
+         edge_adj_list[tgt].append(i)
+
+     for edge in tqdm_async(edges, desc="Preparing batches"):
+         if edge[2].get("visited", False):
+             continue
+
+         edge[2]["visited"] = True
+
+         _process_nodes = []
+         _process_edges = []
+
+         src_id = edge[0]
+         tgt_id = edge[1]
+
+         _process_nodes.extend([await get_cached_node_info(src_id),
+                                await get_cached_node_info(tgt_id)])
+         _process_edges.append(edge)
+
+         if expand_method == "max_width":
+             level_n_edges = _get_level_n_edges_by_max_width(
+                 edge_adj_list, node_dict, edges, nodes, edge, max_depth,
+                 traverse_strategy.bidirectional, traverse_strategy.max_extra_edges,
+                 edge_sampling, traverse_strategy.loss_strategy
+             )
+         else:
+             level_n_edges = _get_level_n_edges_by_max_tokens(
+                 edge_adj_list, node_dict, edges, nodes, edge, max_depth,
+                 traverse_strategy.bidirectional, traverse_strategy.max_tokens,
+                 edge_sampling, traverse_strategy.loss_strategy
+             )
+
+         for _edge in level_n_edges:
+             _process_nodes.append(await get_cached_node_info(_edge[0]))
+             _process_nodes.append(await get_cached_node_info(_edge[1]))
+             _process_edges.append(_edge)
+
+         # Deduplicate nodes and edges
+         _process_nodes = list({node['node_id']: node for node in _process_nodes}.values())
+         _process_edges = list({(edge[0], edge[1]): edge for edge in _process_edges}.values())
+
+         processing_batches.append((_process_nodes, _process_edges))
+
+     logger.info("Processing batches: %d", len(processing_batches))
+
+     # Isolated nodes
+     isolated_node_strategy = traverse_strategy.isolated_node_strategy
+     if isolated_node_strategy == "add":
+         processing_batches = await _add_isolated_nodes(nodes, processing_batches, graph_storage)
+         logger.info("Processing batches after adding isolated nodes: %d", len(processing_batches))
+
+     return processing_batches
+
+
+ async def _add_isolated_nodes(
+     nodes: list,
+     processing_batches: list,
+     graph_storage: NetworkXStorage,
+ ) -> list:
+     visited_nodes = set()
+     for _process_nodes, _ in processing_batches:
+         for node in _process_nodes:
+             visited_nodes.add(node["node_id"])
+     for node in nodes:
+         if node[0] not in visited_nodes:
+             _process_nodes = [await _get_node_info(node[0], graph_storage)]
+             processing_batches.append((_process_nodes, []))
+     return processing_batches
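For intuition, a minimal sketch of the sampling semantics behind _sort_edges above, on toy edges of the form (src, tgt, data); the loss values are invented for illustration:

    edges = [
        ("a", "b", {"loss": 0.9}),
        ("b", "c", {"loss": 0.1}),
        ("c", "d", {"loss": 0.5}),
    ]

    # "min_loss" puts the edges with the smallest stored loss first ...
    assert [e[2]["loss"] for e in sorted(edges, key=lambda x: x[2]["loss"])] == [0.1, 0.5, 0.9]
    # ... while "max_loss" reverses that order.
    assert [e[2]["loss"] for e in sorted(edges, key=lambda x: x[2]["loss"], reverse=True)] == [0.9, 0.5, 0.1]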
graphgen/operators/traverse_graph.py ADDED
@@ -0,0 +1,485 @@
+ import asyncio
+
+ import gradio as gr
+ from tqdm.asyncio import tqdm as tqdm_async
+
+ from graphgen.models import OpenAIModel, NetworkXStorage, TraverseStrategy, Tokenizer, JsonKVStorage
+ from graphgen.operators.split_graph import get_batches_with_strategy
+ from graphgen.templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT
+ from graphgen.utils import detect_main_language, compute_content_hash, logger
+
+
+ async def _pre_tokenize(graph_storage: NetworkXStorage,
+                         tokenizer: Tokenizer,
+                         edges: list,
+                         nodes: list) -> tuple:
+     """Cache the token length of every node and edge description on the graph."""
+     sem = asyncio.Semaphore(1000)
+
+     async def handle_edge(edge: tuple) -> tuple:
+         async with sem:
+             if 'length' not in edge[2]:
+                 edge[2]['length'] = len(
+                     await asyncio.get_event_loop().run_in_executor(
+                         None, tokenizer.encode_string, edge[2]['description']))
+             return edge
+
+     async def handle_node(node: tuple) -> tuple:
+         async with sem:
+             if 'length' not in node[1]:
+                 node[1]['length'] = len(
+                     await asyncio.get_event_loop().run_in_executor(
+                         None, tokenizer.encode_string, node[1]['description']))
+             return node
+
+     new_edges = []
+     new_nodes = []
+
+     for result in tqdm_async(asyncio.as_completed([handle_edge(edge) for edge in edges]),
+                              total=len(edges), desc="Pre-tokenizing edges"):
+         new_edge = await result
+         await graph_storage.update_edge(new_edge[0], new_edge[1], new_edge[2])
+         new_edges.append(new_edge)
+
+     for result in tqdm_async(asyncio.as_completed([handle_node(node) for node in nodes]),
+                              total=len(nodes), desc="Pre-tokenizing nodes"):
+         new_node = await result
+         await graph_storage.update_node(new_node[0], new_node[1])
+         new_nodes.append(new_node)
+
+     await graph_storage.index_done_callback()
+     return new_edges, new_nodes
+
+
+ async def _construct_rephrasing_prompt(_process_nodes: list,
+                                        _process_edges: list,
+                                        text_chunks_storage: JsonKVStorage,
+                                        add_context: bool = False
+                                        ) -> str:
+     entities = [
+         f"{_process_node['node_id']}: {_process_node['description']}"
+         for _process_node in _process_nodes
+     ]
+     relations = [
+         f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
+         for _process_edge in _process_edges
+     ]
+
+     entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)])
+     relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
+     language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English"
+
+     if add_context:
+         original_ids = ([node['source_id'].split('<SEP>')[0] for node in _process_nodes] +
+                         [edge[2]['source_id'].split('<SEP>')[0] for edge in _process_edges])
+
+         original_ids = list(set(original_ids))
+         original_text = await text_chunks_storage.get_by_ids(original_ids)
+         original_text = "\n".join([f"{index + 1}. {text['content']}" for index, text in enumerate(original_text)])
+
+         return ANSWER_REPHRASING_PROMPT[language]['CONTEXT_TEMPLATE'].format(
+             language=language,
+             original_text=original_text,
+             entities=entities_str,
+             relationships=relations_str
+         )
+
+     return ANSWER_REPHRASING_PROMPT[language]['TEMPLATE'].format(
+         language=language,
+         entities=entities_str,
+         relationships=relations_str
+     )
+
+
+ def get_loss_tercile(losses: list) -> tuple:
+     """Return the two cut points that split the sorted losses into terciles."""
+     losses = sorted(losses)
+     q1_index = int(len(losses) * (1 / 3))
+     q2_index = int(len(losses) * (2 / 3))
+
+     return losses[q1_index], losses[q2_index]
+
+
+ def get_average_loss(batch: tuple, loss_strategy: str) -> float:
+     if loss_strategy == "only_edge":
+         return sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1])
+     if loss_strategy == "both":
+         # Parenthesize the numerator so the mean is taken over both nodes and edges.
+         return (sum(edge[2]['loss'] for edge in batch[1]) + sum(node['loss'] for node in batch[0])) \
+             / (len(batch[0]) + len(batch[1]))
+     raise ValueError("Invalid loss strategy")
+
+
+ def _post_process_synthetic_data(data):
+     blocks = data.split("\n\n")
+     qas = []
+     for line in blocks:
+         if "Question:" in line and "Answer:" in line:
+             question = line.split("Question:")[1].split("Answer:")[0].strip()
+             answer = line.split("Answer:")[1].strip()
+             qas.append({
+                 "question": question,
+                 "answer": answer
+             })
+         elif "问题:" in line and "答案:" in line:
+             question = line.split("问题:")[1].split("答案:")[0].strip()
+             answer = line.split("答案:")[1].strip()
+             qas.append({
+                 "question": question,
+                 "answer": answer
+             })
+         elif "问题:" in line and "回答:" in line:
+             question = line.split("问题:")[1].split("回答:")[0].strip()
+             answer = line.split("回答:")[1].strip()
+             qas.append({
+                 "question": question,
+                 "answer": answer
+             })
+     return qas
+
+
+ async def traverse_graph_by_edge(
+     llm_client: OpenAIModel,
+     tokenizer: Tokenizer,
+     graph_storage: NetworkXStorage,
+     traverse_strategy: TraverseStrategy,
+     text_chunks_storage: JsonKVStorage,
+     progress_bar: gr.Progress = None,
+     max_concurrent: int = 1000
+ ) -> dict:
+     """
+     Traverse the graph edge by edge and generate QA pairs.
+
+     :param llm_client: synthesizer LLM client
+     :param tokenizer: tokenizer used for length accounting
+     :param graph_storage: graph storage instance
+     :param traverse_strategy: traversal strategy
+     :param text_chunks_storage: storage holding the original text chunks
+     :param progress_bar: optional Gradio progress bar
+     :param max_concurrent: maximum number of concurrent LLM calls
+     :return: question and answer pairs
+     """
+
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def _process_nodes_and_edges(
+         _process_nodes: list,
+         _process_edges: list,
+     ) -> str:
+         prompt = await _construct_rephrasing_prompt(
+             _process_nodes,
+             _process_edges,
+             text_chunks_storage,
+             add_context=False
+         )
+         context = await llm_client.generate_answer(prompt)
+
+         # Post-process the context
+         if context.startswith("Rephrased Text:"):
+             context = context[len("Rephrased Text:"):].strip()
+         elif context.startswith("重述文本:"):
+             context = context[len("重述文本:"):].strip()
+
+         return context
+
+     async def _process_single_batch(
+         _process_batch: tuple,
+         question_type: str = "single"
+     ) -> dict:
+         async with semaphore:
+             context = await _process_nodes_and_edges(
+                 _process_batch[0],
+                 _process_batch[1],
+             )
+
+             language = "Chinese" if detect_main_language(context) == "zh" else "English"
+             pre_length = sum(node['length'] for node in _process_batch[0]) \
+                 + sum(edge[2]['length'] for edge in _process_batch[1])
+
+             if question_type == "single":
+                 question = await llm_client.generate_answer(
+                     QUESTION_GENERATION_PROMPT[language]['SINGLE_TEMPLATE'].format(
+                         answer=context
+                     )
+                 )
+                 if question.startswith("Question:"):
+                     question = question[len("Question:"):].strip()
+                 elif question.startswith("问题:"):
+                     question = question[len("问题:"):].strip()
+
+                 logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1]))
+                 logger.info("Pre-length: %s", pre_length)
+                 logger.info("Question: %s", question)
+                 logger.info("Answer: %s", context)
+
+                 return {
+                     compute_content_hash(context): {
+                         "question": question,
+                         "answer": context,
+                         "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy)
+                     }
+                 }
+
+             content = await llm_client.generate_answer(
+                 QUESTION_GENERATION_PROMPT[language]['MULTI_TEMPLATE'].format(
+                     doc=context
+                 )
+             )
+             qas = _post_process_synthetic_data(content)
+
+             if len(qas) == 0:
+                 logger.error("Error occurred while processing batch, question or answer is None: %s", content)
+                 return {}
+
+             final_results = {}
+             logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1]))
+             logger.info("Pre-length: %s", pre_length)
+             for qa in qas:
+                 logger.info("Question: %s", qa['question'])
+                 logger.info("Answer: %s", qa['answer'])
+                 final_results[compute_content_hash(qa['question'])] = {
+                     "question": qa['question'],
+                     "answer": qa['answer'],
+                     "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy)
+                 }
+             return final_results
+
+     results = {}
+     edges = list(await graph_storage.get_all_edges())
+     nodes = list(await graph_storage.get_all_nodes())
+
+     edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
+
+     processing_batches = await get_batches_with_strategy(
+         nodes,
+         edges,
+         graph_storage,
+         traverse_strategy
+     )
+
+     for result in tqdm_async(asyncio.as_completed(
+             [_process_single_batch(batch) for batch in processing_batches]
+     ), total=len(processing_batches), desc="[4/4]Generating QAs"):
+         try:
+             if progress_bar is not None:
+                 progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs")
+             results.update(await result)
+             if progress_bar is not None and len(results) == len(processing_batches):
+                 progress_bar(1, desc="[4/4]Generating QAs")
+         except Exception as e:  # pylint: disable=broad-except
+             logger.error("Error occurred while generating QA: %s", e)
+
+     return results
+
+
+ async def traverse_graph_atomically(
+     llm_client: OpenAIModel,
+     tokenizer: Tokenizer,
+     graph_storage: NetworkXStorage,
+     traverse_strategy: TraverseStrategy,
+     text_chunks_storage: JsonKVStorage,
+     progress_bar: gr.Progress = None,
+     max_concurrent: int = 1000
+ ) -> dict:
+     """
+     Traverse the graph atomically: generate one QA pair per node or edge description.
+
+     :param llm_client: synthesizer LLM client
+     :param tokenizer: tokenizer used for length accounting
+     :param graph_storage: graph storage instance
+     :param traverse_strategy: traversal strategy
+     :param text_chunks_storage: storage holding the original text chunks
+     :param progress_bar: optional Gradio progress bar
+     :param max_concurrent: maximum number of concurrent LLM calls
+     :return: question and answer pairs
+     """
+     assert traverse_strategy.qa_form == "atomic"
+
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def _generate_question(
+         node_or_edge: tuple
+     ):
+         if len(node_or_edge) == 2:
+             des = node_or_edge[0] + ": " + node_or_edge[1]['description']
+             loss = node_or_edge[1]['loss']
+         else:
+             des = node_or_edge[2]['description']
+             loss = node_or_edge[2]['loss']
+
+         async with semaphore:
+             try:
+                 language = "Chinese" if detect_main_language(des) == "zh" else "English"
+
+                 qa = await llm_client.generate_answer(
+                     QUESTION_GENERATION_PROMPT[language]['SINGLE_QA_TEMPLATE'].format(
+                         doc=des
+                     )
+                 )
+
+                 if "Question:" in qa and "Answer:" in qa:
+                     question = qa.split("Question:")[1].split("Answer:")[0].strip()
+                     answer = qa.split("Answer:")[1].strip()
+                 elif "问题:" in qa and "答案:" in qa:
+                     question = qa.split("问题:")[1].split("答案:")[0].strip()
+                     answer = qa.split("答案:")[1].strip()
+                 else:
+                     return {}
+
+                 question = question.strip("\"")
+                 answer = answer.strip("\"")
+
+                 logger.info("Question: %s", question)
+                 logger.info("Answer: %s", answer)
+                 return {
+                     compute_content_hash(question): {
+                         "question": question,
+                         "answer": answer,
+                         "loss": loss
+                     }
+                 }
+             except Exception as e:  # pylint: disable=broad-except
+                 logger.error("Error occurred while generating question: %s", e)
+                 return {}
+
+     results = {}
+     edges = list(await graph_storage.get_all_edges())
+     nodes = list(await graph_storage.get_all_nodes())
+
+     edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
+
+     tasks = []
+     for node in nodes:
+         if "<SEP>" in node[1]['description']:
+             for item in node[1]['description'].split("<SEP>"):
+                 tasks.append((node[0], {"description": item, "loss": node[1]['loss']}))
+         else:
+             tasks.append((node[0], node[1]))
+     for edge in edges:
+         if "<SEP>" in edge[2]['description']:
+             for item in edge[2]['description'].split("<SEP>"):
+                 tasks.append((edge[0], edge[1], {"description": item, "loss": edge[2]['loss']}))
+         else:
+             tasks.append((edge[0], edge[1], edge[2]))
+
+     for result in tqdm_async(
+             asyncio.as_completed([_generate_question(task) for task in tasks]),
+             total=len(tasks),
+             desc="[4/4]Generating QAs"
+     ):
+         try:
+             if progress_bar is not None:
+                 progress_bar(len(results) / len(tasks), desc="[4/4]Generating QAs")
+             results.update(await result)
+             if progress_bar is not None and len(results) == len(tasks):
+                 progress_bar(1, desc="[4/4]Generating QAs")
+         except Exception as e:  # pylint: disable=broad-except
+             logger.error("Error occurred while generating QA: %s", e)
+     return results
+
+
+ async def traverse_graph_for_multi_hop(
+     llm_client: OpenAIModel,
+     tokenizer: Tokenizer,
+     graph_storage: NetworkXStorage,
+     traverse_strategy: TraverseStrategy,
+     text_chunks_storage: JsonKVStorage,
+     progress_bar: gr.Progress = None,
+     max_concurrent: int = 1000
+ ) -> dict:
+     """
+     Traverse the graph to generate multi-hop QA pairs.
+
+     :param llm_client: synthesizer LLM client
+     :param tokenizer: tokenizer used for length accounting
+     :param graph_storage: graph storage instance
+     :param traverse_strategy: traversal strategy
+     :param text_chunks_storage: storage holding the original text chunks
+     :param progress_bar: optional Gradio progress bar
+     :param max_concurrent: maximum number of concurrent LLM calls
+     :return: question and answer pairs
+     """
+     assert traverse_strategy.qa_form == "multi_hop"
+
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     results = {}
+     edges = list(await graph_storage.get_all_edges())
+     nodes = list(await graph_storage.get_all_nodes())
+
+     edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
+
+     processing_batches = await get_batches_with_strategy(
+         nodes,
+         edges,
+         graph_storage,
+         traverse_strategy
+     )
+
+     async def _process_single_batch(
+         _process_batch: tuple
+     ) -> dict:
+         async with semaphore:
+             try:
+                 language = "Chinese" if detect_main_language(_process_batch[0][0]['description']) == "zh" \
+                     else "English"
+
+                 _process_nodes = _process_batch[0]
+                 _process_edges = _process_batch[1]
+
+                 entities = [
+                     f"{_process_node['node_id']}: {_process_node['description']}"
+                     for _process_node in _process_nodes
+                 ]
+
+                 relations = [
+                     f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
+                     for _process_edge in _process_edges
+                 ]
+
+                 entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)])
+                 relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
+
+                 prompt = MULTI_HOP_GENERATION_PROMPT[language].format(
+                     entities=entities_str,
+                     relationships=relations_str
+                 )
+
+                 context = await llm_client.generate_answer(prompt)
+
+                 # Post-process the context
+                 if "Question:" in context and "Answer:" in context:
+                     question = context.split("Question:")[1].split("Answer:")[0].strip()
+                     answer = context.split("Answer:")[1].strip()
+                 elif "问题:" in context and "答案:" in context:
+                     question = context.split("问题:")[1].split("答案:")[0].strip()
+                     answer = context.split("答案:")[1].strip()
+                 else:
+                     return {}
+
+                 question = question.strip("\"")
+                 answer = answer.strip("\"")
+
+                 logger.info("Question: %s", question)
+                 logger.info("Answer: %s", answer)
+
+                 return {
+                     compute_content_hash(question): {
+                         "question": question,
+                         "answer": answer,
+                         "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy),
+                     }
+                 }
+
+             except Exception as e:  # pylint: disable=broad-except
+                 logger.error("Error occurred while processing batch: %s", e)
+                 return {}
+
+     async for result in tqdm_async(
+             asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]),
+             total=len(processing_batches),
+             desc="[4/4]Generating QAs"
+     ):
+         try:
+             if progress_bar is not None:
+                 progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs")
+             results.update(await result)
+             if progress_bar is not None and len(results) == len(processing_batches):
+                 progress_bar(1, desc="[4/4]Generating QAs")
+         except Exception as e:  # pylint: disable=broad-except
+             logger.error("Error occurred while generating QA: %s", e)
+     return results
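A standalone illustration of the "Question:"/"Answer:" parsing performed by _post_process_synthetic_data above, on a made-up model response:

    raw = ("Question: Which institute bred Huanghuazhan?\n"
           "Answer: The Rice Research Institute of the Guangdong Academy of Agricultural Sciences.\n"
           "\n"
           "Question: Where was it demonstrated?\n"
           "Answer: Wang'er Town, Yanshan County.")

    # Each blank-line-separated block yields one {"question": ..., "answer": ...} dict.
    for block in raw.split("\n\n"):
        question = block.split("Question:")[1].split("Answer:")[0].strip()
        answer = block.split("Answer:")[1].strip()
        print({"question": question, "answer": answer})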
graphgen/templates/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .kg_extraction import KG_EXTRACTION_PROMPT
+ from .kg_summarization import KG_SUMMARIZATION_PROMPT
+ from .search_judgement import SEARCH_JUDGEMENT_PROMPT
+ from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
+ from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
+ from .answer_rephrasing import ANSWER_REPHRASING_PROMPT
+ from .question_generation import QUESTION_GENERATION_PROMPT
+ from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
+ from .coreference_resolution import COREFERENCE_RESOLUTION_TEMPLATE
graphgen/templates/answer_rephrasing.py ADDED
@@ -0,0 +1,219 @@
+ TEMPLATE_CONTEXT_EN: str = """---Role---
+
+ You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on the ENTITIES and RELATIONSHIPS provided below. You may refer to the original text to assist in generating the rephrased version, but ensure that the final output text meets the requirements.
+ Use {language} as output language.
+
+ ---Goal---
+ To generate a rephrased version of the text that conveys the same meaning as the original entity and relationship descriptions, while:
+ 1. Following a clear logical flow and structure
+ 2. Establishing proper cause-and-effect relationships
+ 3. Ensuring temporal and sequential consistency
+ 4. Creating smooth transitions between ideas using conjunctions and appropriate linking words like "firstly," "however," "therefore," etc.
+
+ ---Instructions---
+ 1. Analyze the provided ENTITIES and RELATIONSHIPS carefully to identify:
+    - Key concepts and their hierarchies
+    - Temporal sequences and chronological order
+    - Cause-and-effect relationships
+    - Dependencies between different elements
+
+ 2. Organize the information in a logical sequence by:
+    - Starting with foundational concepts
+    - Building up to more complex relationships
+    - Grouping related ideas together
+    - Creating clear transitions between sections
+
+ 3. Rephrase the text while maintaining:
+    - Logical flow and progression
+    - Clear connections between ideas
+    - Proper context and background
+    - Coherent narrative structure
+
+ 4. Review and refine the text to ensure:
+    - Logical consistency throughout
+    - Clear cause-and-effect relationships
+
+ ################
+ -ORIGINAL TEXT-
+ ################
+ {original_text}
+
+ ################
+ -ENTITIES-
+ ################
+ {entities}
+
+ ################
+ -RELATIONSHIPS-
+ ################
+ {relationships}
+
+ """
+
+ TEMPLATE_CONTEXT_ZH: str = """---角色---
+
+ 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。你可以参考原始文本辅助生成,但需要确保最终输出的文本符合要求。
+ 使用{language}作为输出语言。
+
+ ---目标---
+
+ 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时:
+ 1. 遵循清晰的逻辑流和结构
+ 2. 建立适当的因果关系
+ 3. 确保时间和顺序的一致性
+ 4. 使用连词和适当的连接词(如"首先"、"然而"、"因此"等)创造流畅的过渡
+
+ ---说明---
+ 1. 仔细分析提供的实体和关系,以识别:
+    - 关键概念及其层级关系
+    - 时间序列和时间顺序
+    - 因果关系
+    - 不同元素之间的依赖关系
+ 2. 通过以下方式将信息组织成逻辑顺序:
+    - 从基础概念开始
+    - 逐步建立更复杂的关系
+    - 将相关的想法分组在一起
+    - 在各部分之间创建清晰的过渡
+ 3. 重述文本时保持:
+    - 逻辑流畅
+    - 概念之间的清晰联系
+    - 适当的上下文和背景
+    - 连贯的叙述结构
+ 4. 检查和完善文本以确保:
+    - 整体逻辑一致性
+    - 清晰的因果关系
+
+ ################
+ -原始文本-
+ ################
+ {original_text}
+
+ ################
+ -实体-
+ ################
+ {entities}
+
+ ################
+ -关系-
+ ################
+ {relationships}
+
+ """
+
+ TEMPLATE_EN: str = """---Role---
+
+ You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on the ENTITIES and RELATIONSHIPS provided below.
+ Use {language} as output language.
+
+ ---Goal---
+ To generate a rephrased version of the text that conveys the same meaning as the original entity and relationship descriptions, while:
+ 1. Following a clear logical flow and structure
+ 2. Establishing proper cause-and-effect relationships
+ 3. Ensuring temporal and sequential consistency
+ 4. Creating smooth transitions between ideas using conjunctions and appropriate linking words like "firstly," "however," "therefore," etc.
+
+ ---Instructions---
+ 1. Analyze the provided ENTITIES and RELATIONSHIPS carefully to identify:
+    - Key concepts and their hierarchies
+    - Temporal sequences and chronological order
+    - Cause-and-effect relationships
+    - Dependencies between different elements
+
+ 2. Organize the information in a logical sequence by:
+    - Starting with foundational concepts
+    - Building up to more complex relationships
+    - Grouping related ideas together
+    - Creating clear transitions between sections
+
+ 3. Rephrase the text while maintaining:
+    - Logical flow and progression
+    - Clear connections between ideas
+    - Proper context and background
+    - Coherent narrative structure
+
+ 4. Review and refine the text to ensure:
+    - Logical consistency throughout
+    - Clear cause-and-effect relationships
+
+ ################
+ -ENTITIES-
+ ################
+ {entities}
+
+ ################
+ -RELATIONSHIPS-
+ ################
+ {relationships}
+
+ """
+
+ TEMPLATE_ZH: str = """---角色---
+
+ 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。
+ 使用{language}作为输出语言。
+
+ ---目标---
+
+ 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时:
+ 1. 遵循清晰的逻辑流和结构
+ 2. 建立适当的因果关系
+ 3. 确保时间和顺序的一致性
+ 4. 使用连词和适当的连接词(如"首先"、"然而"、"因此"等)创造流畅的过渡
+
+ ---说明---
+ 1. 仔细分析提供的实体和关系,以识别:
+    - 关键概念及其层级关系
+    - 时间序列和时间顺序
+    - 因果关系
+    - 不同元素之间的依赖关系
+ 2. 通过以下方式将信息组织成逻辑顺序:
+    - 从基础概念开始
+    - 逐步建立更复杂的关系
+    - 将相关的想法分组在一起
+    - 在各部分之间创建清晰的过渡
+ 3. 重述文本时保持:
+    - 逻辑流畅
+    - 概念之间的清晰联系
+    - 适当的上下文和背景
+    - 连贯的叙述结构
+ 4. 检查和完善文本以确保:
+    - 整体逻辑一致性
+    - 清晰的因果关系
+
+ ################
+ -实体-
+ ################
+ {entities}
+
+ ################
+ -关系-
+ ################
+ {relationships}
+
+ """
+
+ REQUIREMENT_ZH = """
+ ################
+ 请在下方直接输出连贯的重述文本,不要输出任何额外的内容。
+
+ 重述文本:
+ """
+
+ REQUIREMENT_EN = """
+ ################
+ Please directly output the coherent rephrased text below, without any additional content.
+
+ Rephrased Text:
+ """
+
+
+ ANSWER_REPHRASING_PROMPT = {
+     "English": {
+         "TEMPLATE": TEMPLATE_EN + REQUIREMENT_EN,
+         "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_EN + REQUIREMENT_EN
+     },
+     "Chinese": {
+         "TEMPLATE": TEMPLATE_ZH + REQUIREMENT_ZH,
+         "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_ZH + REQUIREMENT_ZH
+     }
+ }
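A minimal sketch of how these templates are consumed (mirroring _construct_rephrasing_prompt in graphgen/operators/traverse_graph.py); the entity and relationship strings are invented examples:

    from graphgen.templates import ANSWER_REPHRASING_PROMPT

    entities_str = "1. Trajan: Roman emperor known for his administrative abilities."
    relations_str = "1. Trajan -- Roman Empire: Trajan governed during a prosperous period."

    prompt = ANSWER_REPHRASING_PROMPT["English"]["TEMPLATE"].format(
        language="English",
        entities=entities_str,
        relationships=relations_str,
    )
    # The reply is expected to begin with "Rephrased Text:", which the caller strips.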
graphgen/templates/coreference_resolution.py ADDED
@@ -0,0 +1,39 @@
+ # pylint: disable=C0301
+ TEMPLATE_ZH: str = """请根据参考文本识别并消解文本中的指代词,明确每个代词所指代的具体实体,并直接输出消解后的文本。
+
+ -示例-
+ 输入:
+ 小明和小红一起去公园。她们玩得很开心。之后,他们去吃冰淇淋。
+ 输出:
+ 小明和小红一起去公园。小明和小红玩得很开心。之后,小明和小红去吃冰淇淋。
+
+ -真实数据-
+ 参考文本:
+ {reference}
+ 输入:
+ {input_sentence}
+ 请直接输出改写后的句子,不要输出任何额外信息。
+ 输出:
+ """
+
+ TEMPLATE_EN: str = """Based on the reference text, please identify and resolve the pronouns in the input text, make explicit the specific entity each pronoun refers to, and directly output the resolved text.
+
+ -Example-
+ Input:
+ John and Mary went to the park. They had a great time. Later, they went to eat ice cream.
+ Output:
+ John and Mary went to the park. John and Mary had a great time. Later, John and Mary went to eat ice cream.
+
+ -Real Data-
+ Reference text:
+ {reference}
+ Input:
+ {input_sentence}
+ Please directly output the rewritten sentence without any additional information.
+ Output:
+ """
+
+ COREFERENCE_RESOLUTION_TEMPLATE = {
+     "en": TEMPLATE_EN,
+     "zh": TEMPLATE_ZH
+ }
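A minimal usage sketch for this template; the reference and input sentences are invented:

    from graphgen.templates import COREFERENCE_RESOLUTION_TEMPLATE

    prompt = COREFERENCE_RESOLUTION_TEMPLATE["en"].format(
        reference="John met Mary at the park.",
        input_sentence="He said she looked happy.",
    )
    # Expected model output: "John said Mary looked happy."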
graphgen/templates/description_rephrasing.py ADDED
@@ -0,0 +1,121 @@
+ ANTI_TEMPLATE_EN: str = """-Goal-
+ Transform the input sentence into its opposite meaning while:
+
+ 1. Preserving most of the original sentence structure
+ 2. Changing only key words that affect the core meaning
+ 3. Maintaining the same tone and style
+ 4. The input sentence provided is a correct description, and the output sentence should be an incorrect description
+ 5. The output sentence should be fluent and grammatically correct
+
+ ################
+ -Examples-
+ ################
+ Input:
+ The bright sunshine made everyone feel energetic and happy.
+
+ Output:
+ The bright sunshine made everyone feel tired and sad.
+
+ ################
+ -Real Data-
+ ################
+ Input:
+ {input_sentence}
+ ################
+ Please directly output the rewritten sentence without any additional information.
+ Output:
+ """
+
+ ANTI_TEMPLATE_ZH: str = """-目标-
+ 将输入句子转换为相反含义的句子,同时:
+
+ 1. 保留大部分原始句子结构
+ 2. 仅更改影响核心含义的关键词
+ 3. 保持相同的语气和风格
+ 4. 提供的输入句子是一个正确的描述,输出句子应该是一个错误的描述
+ 5. 输出句子应该流畅且语法正确
+
+ ################
+ -示例-
+ ################
+ 输入:
+ 明亮的阳光让每个人都感到充满活力和快乐。
+
+ 输出:
+ 明亮的阳光让每个人都感到疲惫和悲伤。
+
+ ################
+ -真实数据-
+ ################
+ 输入:
+ {input_sentence}
+ ################
+ 请直接输出改写后的句子,不要输出任何额外信息。
+ 输出:
+ """
+
+ TEMPLATE_ZH: str = """-目标-
+ 将输入句子转换为相同含义的句子,同时:
+
+ 1. 保留大部分原始句子结构
+ 2. 仅更改影响核心含义的关键词
+ 3. 保持相同的语气和风格
+ 4. 输出句子应该流畅且语法正确
+
+ ################
+ -示例-
+ ################
+ 输入:
+ 明亮的阳光让每个人都感到充满活力和快乐。
+
+ 输出:
+ 明媚的阳光让每个人都感受到活力与快乐。
+
+ ################
+ -真实数据-
+ ################
+ 输入:
+ {input_sentence}
+ ################
+ 请直接输出改写后的句子,不要输出任何额外信息。
+ 输出:
+ """
+
+ TEMPLATE_EN: str = """-Goal-
+ Transform the input sentence into a sentence with the same meaning while:
+
+ 1. Preserving most of the original sentence structure
+ 2. Changing only key words that affect the core meaning
+ 3. Maintaining the same tone and style
+ 4. The output sentence should be fluent and grammatically correct
+
+ ################
+ -Examples-
+ ################
+ Input:
+ The bright sunshine made everyone feel energetic and happy.
+
+ Output:
+ The bright sunshine made everyone feel energetic and joyful.
+
+ ################
+ -Real Data-
+ ################
+ Input:
+ {input_sentence}
+ ################
+ Please directly output the rewritten sentence without any additional information.
+ Output:
+ """
+
+
+ DESCRIPTION_REPHRASING_PROMPT = {
+     "English": {
+         "ANTI_TEMPLATE": ANTI_TEMPLATE_EN,
+         "TEMPLATE": TEMPLATE_EN
+     },
+     "Chinese": {
+         "ANTI_TEMPLATE": ANTI_TEMPLATE_ZH,
+         "TEMPLATE": TEMPLATE_ZH
+     }
+ }
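A minimal usage sketch; the input sentence is taken from the examples above. TEMPLATE asks for a faithful paraphrase, while ANTI_TEMPLATE asks for a deliberately wrong one:

    from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT

    sentence = "The bright sunshine made everyone feel energetic and happy."
    faithful_prompt = DESCRIPTION_REPHRASING_PROMPT["English"]["TEMPLATE"].format(input_sentence=sentence)
    negated_prompt = DESCRIPTION_REPHRASING_PROMPT["English"]["ANTI_TEMPLATE"].format(input_sentence=sentence)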
graphgen/templates/kg_extraction.py ADDED
@@ -0,0 +1,210 @@
+ # pylint: disable=C0301
+
+ TEMPLATE_EN: str = """You are an NLP expert, skilled at analyzing text to extract named entities and their relationships.
+
+ -Goal-
+ Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
+ Use {language} as output language.
+
+ -Steps-
+ 1. Identify all entities. For each identified entity, extract the following information:
+ - entity_name: Name of the entity, use same language as input text. If English, capitalize the name.
+ - entity_type: One of the following types: [{entity_types}]
+ - entity_summary: Comprehensive summary of the entity's attributes and activities
+ Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+
+ 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+ For each pair of related entities, extract the following information:
+ - source_entity: name of the source entity, as identified in step 1
+ - target_entity: name of the target entity, as identified in step 1
+ - relationship_summary: explanation as to why you think the source entity and the target entity are related to each other
+ Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+
+ 3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
+ Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
+
+ 4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+ 5. When finished, output {completion_delimiter}
+
+ ################
+ -Examples-
+ ################
+ -Example 1-
+ Text:
+ ################
+ In the second century of the Christian Era, the empire of Rome comprehended the fairest part of the earth, and the most civilized portion of mankind. The frontiers of that extensive monarchy were guarded by ancient renown and disciplined valor. The gentle but powerful influence of laws and manners had gradually cemented the union of the provinces. Their peaceful inhabitants enjoyed and abused the advantages of wealth and luxury. The image of a free constitution was preserved with decent reverence: the Roman senate appeared to possess the sovereign authority, and devolved on the emperors all the executive powers of government. During a happy period of more than fourscore years, the public administration was conducted by the virtue and abilities of Nerva, Trajan, Hadrian, and the two Antonines.
+ ################
+ Output:
+ ("entity"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"organization"{tuple_delimiter}"The dominant empire of the second century CE, encompassing the most developed regions of the known world."){record_delimiter}
+ ("entity"{tuple_delimiter}"Second Century CE"{tuple_delimiter}"date"{tuple_delimiter}"Time period of the Christian Era when the Roman Empire was at its height."){record_delimiter}
+ ("entity"{tuple_delimiter}"Rome"{tuple_delimiter}"location"{tuple_delimiter}"The capital and heart of the Roman Empire."){record_delimiter}
+ ("entity"{tuple_delimiter}"Roman Senate"{tuple_delimiter}"organization"{tuple_delimiter}"Legislative body that appeared to hold sovereign authority in Rome."){record_delimiter}
+ ("entity"{tuple_delimiter}"Nerva"{tuple_delimiter}"person"{tuple_delimiter}"Roman emperor who contributed to the public administration during a prosperous period."){record_delimiter}
+ ("entity"{tuple_delimiter}"Trajan"{tuple_delimiter}"person"{tuple_delimiter}"Roman emperor known for his virtue and administrative abilities."){record_delimiter}
+ ("entity"{tuple_delimiter}"Hadrian"{tuple_delimiter}"person"{tuple_delimiter}"Roman emperor who governed during the empire's peaceful period."){record_delimiter}
+ ("entity"{tuple_delimiter}"Antonines"{tuple_delimiter}"person"{tuple_delimiter}"Two Roman emperors who ruled during a period of prosperity and good governance."){record_delimiter}
+ ("entity"{tuple_delimiter}"Roman Law"{tuple_delimiter}"concept"{tuple_delimiter}"System of laws and manners that unified the provinces of the Roman Empire."){record_delimiter}
+ ("relationship"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Roman Law"{tuple_delimiter}"The empire was unified and maintained through the influence of its laws and customs."){record_delimiter}
+ ("relationship"{tuple_delimiter}"Roman Senate"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"The Senate appeared to possess sovereign authority while delegating executive powers to emperors."){record_delimiter}
+ ("relationship"{tuple_delimiter}"Nerva"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Nerva was one of the emperors who contributed to the empire's successful administration."){record_delimiter}
+ ("relationship"{tuple_delimiter}"Trajan"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Trajan was one of the emperors who governed during the empire's prosperous period."){record_delimiter}
+ ("relationship"{tuple_delimiter}"Hadrian"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Hadrian was one of the emperors who managed the empire's administration effectively."){record_delimiter}
+ ("relationship"{tuple_delimiter}"Antonines"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"The Antonines were emperors who helped maintain the empire's prosperity through their governance."){record_delimiter}
+ ("content_keywords"{tuple_delimiter}"Roman governance, imperial prosperity, law and order, civilized society"){completion_delimiter}
+
+ -Example 2-
+ Text:
+ #############
+ Overall, the analysis of the OsDT11 sequence demonstrated that this protein belongs to the CRP family. Since OsDT11 is predicted to be a secreted protein, the subcellular localization of OsDT11 was determined by fusing the OsDT11 ORF to RFP in a p35S::RFP vector by in vivo protein targeting in NB epidermal cells by performing an Agrobacterium tumefaciens-mediated transient assay. After incubation for 48 h, the RFP signals were mainly detected in the cell-wall of OsDT11-RFP transformed cells, while the control cells (transformed with the RFP construct) displayed ubiquitous RFP signals, demonstrating that OsDT11 is a secreted signal peptide. Moreover, when the infiltrated leaf sections were plasmolyzed, the OsDT11-RFP fusion proteins were located on the cell wall.
+ #############
+ Output:
+ ("entity"{tuple_delimiter}"OsDT11"{tuple_delimiter}"gene"{tuple_delimiter}"A protein sequence belonging to the CRP family, demonstrated to be a secreted signal peptide that localizes to cell walls."){record_delimiter}
+ ("entity"{tuple_delimiter}"CRP family"{tuple_delimiter}"science"{tuple_delimiter}"A protein family to which OsDT11 belongs, characterized by specific structural and functional properties."){record_delimiter}
+ ("entity"{tuple_delimiter}"RFP"{tuple_delimiter}"technology"{tuple_delimiter}"Red Fluorescent Protein, used as a fusion marker to track protein localization in cells."){record_delimiter}
+ ("entity"{tuple_delimiter}"p35S::RFP vector"{tuple_delimiter}"technology"{tuple_delimiter}"A genetic construct used for protein expression and visualization studies, containing the 35S promoter and RFP marker."){record_delimiter}
+ ("entity"{tuple_delimiter}"NB epidermal cells"{tuple_delimiter}"nature"{tuple_delimiter}"Plant epidermal cells used as the experimental system for protein localization studies."){record_delimiter}
+ ("entity"{tuple_delimiter}"Agrobacterium tumefaciens"{tuple_delimiter}"nature"{tuple_delimiter}"A bacterial species used for transferring genetic material into plant cells in laboratory experiments."){record_delimiter}
+ ("relationship"{tuple_delimiter}"OsDT11"{tuple_delimiter}"CRP family"{tuple_delimiter}"OsDT11 is identified as a member of the CRP family through sequence analysis."){record_delimiter}
+ ("relationship"{tuple_delimiter}"OsDT11"{tuple_delimiter}"RFP"{tuple_delimiter}"OsDT11 was fused to RFP to study its cellular localization."){record_delimiter}
+ ("relationship"{tuple_delimiter}"Agrobacterium tumefaciens"{tuple_delimiter}"NB epidermal cells"{tuple_delimiter}"Agrobacterium tumefaciens was used to transfer genetic material into NB epidermal cells through a transient assay."){record_delimiter}
+ ("relationship"{tuple_delimiter}"OsDT11"{tuple_delimiter}"NB epidermal cells"{tuple_delimiter}"OsDT11's subcellular localization was studied in NB epidermal cells, showing cell wall targeting."){record_delimiter}
+ ("content_keywords"{tuple_delimiter}"protein localization, gene expression, cellular biology, molecular techniques"){completion_delimiter}
+
+ ################
+ -Real Data-
+ ################
+ Entity_types: {entity_types}
+ Text: {input_text}
+ ################
+ Output:
+ """
+
+
+ TEMPLATE_ZH: str = """你是一个NLP专家,擅长分析文本提取命名实体和关系。
+
+ -目标-
+ 给定一个实体类型列表和可能与列表相关的文本,从文本中识别所有这些类型的实体,以及这些实体之间所有的关系。
+ 使用{language}作为输出语言。
+
+ -步骤-
+ 1. 识别所有实体。对于每个识别的实体,提取以下信息:
+ - entity_name:实体的名称,首字母大写
+ - entity_type:以下类型之一:[{entity_types}]
+ - entity_summary:实体的属性与活动的全面总结
+ 将每个实体格式化为("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+
+ 2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。
+ 对于每对相关的实体,提取以下信息:
+ - source_entity:步骤1中识别的源实体名称
+ - target_entity:步骤1中识别的目标实体名称
+ - relationship_summary:解释为什么你认为源实体和目标实体彼此相关
+ 将每个关系格式化为("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+
+ 3. 识别总结整个文本的主要概念、主题或话题的高级关键词。这些应该捕捉文档中存在的总体思想。
+ 将内容级关键词格式化为("content_keywords"{tuple_delimiter}<high_level_keywords>)
+
+ 4. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。
+
+ 5. 完成后,输出{completion_delimiter}
+
+ ################
+ -示例-
+ ################
+ -示例 1-
+ 文本:
+ ################
+ 鲁镇的酒店的格局,是和别处不同的:都是当街一个曲尺形的大柜台,柜里面预备着热水,可以随时温酒。做工的人,傍午傍晚散了工,每每花四文铜钱,买一碗酒,——这是二十多年前的事,现在每碗要涨到十文,——靠柜外站着,热热的喝了休息;倘肯多花一文,便可以买一碟盐煮笋,或者茴香豆,做下酒物了,如果出到十几文,那就能买一样荤菜,但这些顾客,多是短衣帮,大抵没有这样阔绰。只有穿长衫的,才踱进店面隔壁的房子里,要酒要菜,慢慢地坐喝。
+ ################
+ 输出:
+ ("entity"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"location"{tuple_delimiter}"鲁镇的酒店是一个特定地点,其格局独特,柜台形状为曲尺形,提供热水温酒服务。"){record_delimiter}
+ ("entity"{tuple_delimiter}"曲尺形的大柜台"{tuple_delimiter}"keyword"{tuple_delimiter}"曲尺形的大柜台是鲁镇酒店内独特的设施,用于提供服务。"){record_delimiter}
+ ("entity"{tuple_delimiter}"热水温酒"{tuple_delimiter}"keyword"{tuple_delimiter}"热水温酒是鲁镇酒店提供的一项服务,顾客可以随时温酒。"){record_delimiter}
+ ("entity"{tuple_delimiter}"做工的人"{tuple_delimiter}"person"{tuple_delimiter}"做工的人是鲁镇酒店的常客,通常在工作结束后花四文铜钱买一碗酒,有时还会买一些下酒菜。"){record_delimiter}
+ ("entity"{tuple_delimiter}"二十多年前的事"{tuple_delimiter}"date"{tuple_delimiter}"二十多年前的事是指过去的时间点,当时一碗酒的价格为四文铜钱。"){record_delimiter}
+ ("entity"{tuple_delimiter}"现在"{tuple_delimiter}"date"{tuple_delimiter}"现在是指当前的时间点,与过去相比,一碗酒的价格涨到了十文。"){record_delimiter}
+ ("entity"{tuple_delimiter}"短衣帮"{tuple_delimiter}"concept"{tuple_delimiter}"短衣帮是指做工的人,他们通常穿着短衣,经济条件有限。"){record_delimiter}
+ ("entity"{tuple_delimiter}"穿长衫的"{tuple_delimiter}"person"{tuple_delimiter}"穿长衫的是鲁镇酒店的另一类顾客,他们经济条件较好,通常会进入店面隔壁的房间慢慢喝酒吃菜。"){record_delimiter}
+ ("entity"{tuple_delimiter}"盐煮笋"{tuple_delimiter}"food"{tuple_delimiter}"盐煮笋是鲁镇酒店提供的一种下酒菜,顾客可以花一文铜钱购买。"){record_delimiter}
+ ("entity"{tuple_delimiter}"茴香豆"{tuple_delimiter}"food"{tuple_delimiter}"茴香豆是鲁镇酒店提供的另一种下酒菜,顾客可以花一文铜钱购买。"){record_delimiter}
+ ("entity"{tuple_delimiter}"荤菜"{tuple_delimiter}"food"{tuple_delimiter}"荤菜是鲁镇酒店提供的较为昂贵的菜品,顾客需要花十几文铜钱购买。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"曲尺形的大柜台"{tuple_delimiter}"鲁镇的酒店内设有一个曲尺形的大柜台,用于提供服务。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"热水温酒"{tuple_delimiter}"鲁镇的酒店提供热水温酒服务,顾客可以随时温酒。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"二十多年前的事"{tuple_delimiter}"做工的人在二十多年前花四文铜钱买一碗酒,反映了当时的生活成本。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"现在"{tuple_delimiter}"现在做工的人需要花十文铜钱买一碗酒,反映了物价的上涨。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"短衣帮"{tuple_delimiter}"做工的人属于短衣帮,通常经济条件有限。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"穿长衫的"{tuple_delimiter}"做工的人与穿长衫的形成对比,反映了社会阶层的差异。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"穿长衫的"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"穿长衫的顾客通常会进入鲁镇酒店的房间慢慢喝酒吃菜,享受更高级的服务。"){record_delimiter}
+ ("content_keywords"{tuple_delimiter}"社会分层, 经济差距, 服务, 生活成本, 历史背景"){completion_delimiter}
+
+ -示例 2-
+ 文本:
+ ################
+ 黄华占是感温型常规稻品种,2016—2017 年在铅山县汪二镇作中稻示范种植综合表现优良。结合示范情况,对黄华占的特征特性作简单总结,在此基础上提出高产栽培技术,以期为该品种的推广种植提供参考。近年来,铅山县粮食生产紧紧围绕“稳产、优质、增效”的总体要求、大力实施优质稻推广,积极引导粮食生产由增产转向提质。我国杂交水稻技术世界领先、优质稻品种众多,在市场走势方面(尤其稻米行情清淡期),人们习惯性地北涨看长粒香、南涨看黄华占。黄华占是广东省农业科学院水稻研究所以黄新占/丰华占为亲本选育而成,分别通过粤、湘、鄂、浙、桂、琼等省审定。为了更好、更快地推广黄华占水稻,铅山县分别于2016 年、2017 年在汪二镇火田村试验示范种植黄华占近 5.87 hm^2 ,综合表现优良。现将黄华占水稻的特征特性及高产栽培技术介绍如下。
+ ################
+ 输出:
+ ("entity"{tuple_delimiter}"黄华占"{tuple_delimiter}"work"{tuple_delimiter}"黄华占是一种感温型常规稻品种,由广东省农业科学院水稻研究所选育,通过多个省份审定,2016-2017年在铅山县汪二镇进行示范种植,表现优良。"){record_delimiter}
+ ("entity"{tuple_delimiter}"2016—2017年"{tuple_delimiter}"date"{tuple_delimiter}"2016—2017年是黄华占在铅山县汪二镇进行示范种植的时间段。"){record_delimiter}
+ ("entity"{tuple_delimiter}"铅山县"{tuple_delimiter}"location"{tuple_delimiter}"铅山县位于中国江西省,是黄华占水稻示范种植的地点之一。"){record_delimiter}
+ ("entity"{tuple_delimiter}"汪二镇"{tuple_delimiter}"location"{tuple_delimiter}"汪二镇是铅山县的一个镇,2016-2017年在此进行了黄华占水稻的示范种植。"){record_delimiter}
+ ("entity"{tuple_delimiter}"火田村"{tuple_delimiter}"location"{tuple_delimiter}"火田村是汪二镇的一个村庄,2016-2017年在此进行了黄华占水稻的试验示范种植。"){record_delimiter}
+ ("entity"{tuple_delimiter}"广东省农业科学院水稻研究所"{tuple_delimiter}"organization"{tuple_delimiter}"广东省农业科学院水稻研究所是中国的一个科研机构,负责黄华占水稻的选育工作。"){record_delimiter}
+ ("entity"{tuple_delimiter}"黄新占/丰华占"{tuple_delimiter}"work"{tuple_delimiter}"黄新占和丰华占是黄华占水稻的亲本,用于选育黄华占。"){record_delimiter}
+ ("entity"{tuple_delimiter}"粤、湘、鄂、浙、桂、琼等省"{tuple_delimiter}"location"{tuple_delimiter}"这些省份通过了黄华占水稻的审定,表明该品种在这些地区具有良好的适应性和推广潜力。"){record_delimiter}
+ ("entity"{tuple_delimiter}"高产栽培技术"{tuple_delimiter}"technology"{tuple_delimiter}"高产栽培技术是指为了提高黄华占水稻产量而采用的一系列农业技术措施。"){record_delimiter}
+ ("entity"{tuple_delimiter}"稳产、优质、增效"{tuple_delimiter}"concept"{tuple_delimiter}"这是铅山县粮食生产的主要目标,强调了粮食生产的稳定、质量和效益。"){record_delimiter}
+ ("entity"{tuple_delimiter}"优质稻推广"{tuple_delimiter}"mission"{tuple_delimiter}"优质稻推广是铅山县粮食生产的一个重要任务,旨在提高稻米的质量和市场竞争力。"){record_delimiter}
+ ("entity"{tuple_delimiter}"杂交水稻技术"{tuple_delimiter}"technology"{tuple_delimiter}"杂交水稻技术是中国领先的世界级农业技术,用于提高水稻的产量和质量。"){record_delimiter}
+ ("entity"{tuple_delimiter}"北涨看长粒香、南涨看黄华占"{tuple_delimiter}"concept"{tuple_delimiter}"这是市场对不同地区优质稻品种的习惯性关注点,北方面对长粒香,南方面对黄华占。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"2016—2017年"{tuple_delimiter}"黄华占在2016—2017年期间在铅山县进行了示范种植,展示了其优良的特性。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"铅山县"{tuple_delimiter}"黄华占在铅山县进行了示范种植,表现出了优良的适应性和产量。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"汪二镇"{tuple_delimiter}"黄华占在汪二镇进行了示范种植,这是其在铅山县示范种植的一部分。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"火田村"{tuple_delimiter}"黄华占在火田村进行了试验示范种植,这是其在汪二镇示范种植的一部分。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"广东省农业科学院水稻研究所"{tuple_delimiter}"黄华占是由广东省农业科学院水稻研究所选育的,该研究所负责其研发工作。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"黄新占/丰华占"{tuple_delimiter}"黄华占的亲本是黄新占和丰华占,这些亲本用于选育黄华占。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"粤、湘、鄂、浙、桂、琼等省"{tuple_delimiter}"黄华占通过了这些省份的审定,表明其在这些地区的适应性和推广潜力。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"高产栽培技术"{tuple_delimiter}"高产栽培技术是为了提高黄华占水稻产量而开发的技术措施。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"铅山县"{tuple_delimiter}"稳产、优质、增效"{tuple_delimiter}"铅山县的粮食生产目标是稳产、优质、增效,这些目标指导了黄华占的示范种植。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"铅山县"{tuple_delimiter}"优质稻推广"{tuple_delimiter}"铅山县实施了优质稻推广计划,黄华占是该计划的一部分。"){record_delimiter}
+ ("relationship"{tuple_delimiter}"杂交水稻技术"{tuple_delimiter}"北涨看长粒香、南涨看黄华占"{tuple_delimiter}"杂交水稻技术的发展使得黄华占等优质稻品种在市场中受到关注。"){record_delimiter}
+ ("content_keywords"{tuple_delimiter}"黄华占, 水稻种植, 高产栽培技术, 优质稻推广, 地区适应性, 市场趋势, 技术影响"){completion_delimiter}
+
+ -真实数据-
+ 实体类型:{entity_types}
+ 文本:{input_text}
+ ################
+ 输出:
+ """
+
+ CONTINUE_EN: str = """MANY entities and relationships were missed in the last extraction. \
+ Add them below using the same format:
+ """
+
+ CONTINUE_ZH: str = """很多实体和关系在上一次的提取中可能被遗漏了。请在下面使用相同的格式添加它们:"""
+
+ IF_LOOP_EN: str = """It appears some entities and relationships may have still been missed. \
+ Answer YES | NO if there are still entities and relationships that need to be added.
+ """
+
+ IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。"""
+
+ KG_EXTRACTION_PROMPT: dict = {
+     "English": {
+         "TEMPLATE": TEMPLATE_EN,
+         "CONTINUE": CONTINUE_EN,
+         "IF_LOOP": IF_LOOP_EN,
+     },
+     "Chinese": {
+         "TEMPLATE": TEMPLATE_ZH,
+         "CONTINUE": CONTINUE_ZH,
+         "IF_LOOP": IF_LOOP_ZH,
+     },
+     "FORMAT": {
+         "tuple_delimiter": "<|>",
+         "record_delimiter": "##",
+         "completion_delimiter": "<|COMPLETE|>",
+         "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \
+ science, technology, mission, gene",
+         "language": "English",
+     },
+ }
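A minimal sketch of how the FORMAT block is injected into the extraction template, and how one emitted record can be split back into fields; the sample record mirrors the English example above:

    from graphgen.templates import KG_EXTRACTION_PROMPT

    fmt = KG_EXTRACTION_PROMPT["FORMAT"]
    prompt = KG_EXTRACTION_PROMPT["English"]["TEMPLATE"].format(
        input_text="In the second century of the Christian Era ...",
        **fmt,
    )

    record = '("entity"<|>"Trajan"<|>"person"<|>"Roman emperor known for his virtue.")'
    fields = record.strip("()").split(fmt["tuple_delimiter"])
    # fields -> ['"entity"', '"Trajan"', '"person"', '"Roman emperor known for his virtue."']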