github-actions[bot] committed
Commit d2a63cc · 1 Parent(s): 4e14cb8
Auto-sync from demo at Tue Sep 23 09:05:50 UTC 2025
Files changed:
- app.py +5 -33
- graphgen/{version.py → _version.py} +5 -6
- graphgen/{models/embed → bases}/__init__.py +0 -0
- graphgen/bases/base_reader.py +20 -0
- graphgen/{models/storage → bases}/base_storage.py +0 -4
- graphgen/configs/aggregated_config.yaml +1 -2
- graphgen/configs/atomic_config.yaml +1 -2
- graphgen/configs/cot_config.yaml +1 -2
- graphgen/configs/multi_hop_config.yaml +1 -2
- graphgen/graphgen.py +43 -84
- graphgen/judge.py +0 -60
- graphgen/models/__init__.py +1 -0
- graphgen/models/embed/embedding.py +0 -29
- graphgen/models/reader/__init__.py +22 -0
- graphgen/models/reader/csv_reader.py +14 -0
- graphgen/models/reader/json_reader.py +18 -0
- graphgen/models/reader/jsonl_reader.py +23 -0
- graphgen/models/reader/txt_reader.py +14 -0
- graphgen/models/storage/json_storage.py +1 -1
- graphgen/models/storage/networkx_storage.py +25 -9
- graphgen/models/strategy/base_strategy.py +0 -5
- graphgen/models/strategy/travserse_strategy.py +5 -7
- graphgen/operators/kg/extract_kg.py +1 -1
- graphgen/operators/kg/merge_kg.py +1 -1
- graphgen/utils/__init__.py +0 -1
- graphgen/utils/file.py +0 -24
- webui/app.py +5 -33
- webui/count_tokens.py +16 -17
- webui/examples/chunked_demo.json +0 -14
- webui/examples/csv_demo.csv +5 -0
- webui/examples/json_demo.json +6 -0
- webui/examples/{raw_demo.jsonl → jsonl_demo.jsonl} +0 -0
app.py
CHANGED
@@ -116,35 +116,6 @@ def run_graphgen(params, progress=gr.Progress()):
         env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
     )

-    # Load input data
-    file = config["input_file"]
-    if isinstance(file, list):
-        file = file[0]
-
-    data = []
-
-    if file.endswith(".jsonl"):
-        config["input_data_type"] = "raw"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.loads(line) for line in f)
-    elif file.endswith(".json"):
-        config["input_data_type"] = "chunked"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.load(f))
-    elif file.endswith(".txt"):
-        # After reading the file, convert it into raw-format data according to chunk_size
-        config["input_data_type"] = "raw"
-        content = ""
-        with open(file, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-            for line in lines:
-                content += line.strip() + " "
-        size = int(config.get("chunk_size", 512))
-        chunks = [content[i : i + size] for i in range(0, len(content), size)]
-        data.extend([{"content": chunk} for chunk in chunks])
-    else:
-        raise ValueError(f"Unsupported file type: {file}")
-
     # Initialize GraphGen
     graph_gen = init_graph_gen(config, env)
     graph_gen.clear()

@@ -436,19 +407,20 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
             upload_file = gr.File(
                 label=_("Upload File"),
                 file_count="single",
-                file_types=[".txt", ".json", ".jsonl"],
+                file_types=[".txt", ".json", ".jsonl", ".csv"],
                 interactive=True,
             )
             examples_dir = os.path.join(root_dir, "webui", "examples")
             gr.Examples(
                 examples=[
                     [os.path.join(examples_dir, "txt_demo.txt")],
-                    [os.path.join(examples_dir, "
-                    [os.path.join(examples_dir, "
+                    [os.path.join(examples_dir, "jsonl_demo.jsonl")],
+                    [os.path.join(examples_dir, "json_demo.json")],
+                    [os.path.join(examples_dir, "csv_demo.csv")],
                 ],
                 inputs=upload_file,
                 label=_("Example Files"),
-                examples_per_page=
+                examples_per_page=4,
             )
         with gr.Column(scale=1):
             output = gr.File(
graphgen/{version.py → _version.py}
RENAMED
@@ -1,7 +1,6 @@
-
 from typing import Tuple

-__version__ =
+__version__ = "20250416"
 short_version = __version__


@@ -15,13 +14,13 @@ def parse_version_info(version_str: str) -> Tuple:
         tuple: A sequence of integer and string represents version.
     """
     _version_info = []
-    for x in version_str.split(
+    for x in version_str.split("."):
         if x.isdigit():
             _version_info.append(int(x))
-        elif x.find(
-            patch_version = x.split(
+        elif x.find("rc") != -1:
+            patch_version = x.split("rc")
             _version_info.append(int(patch_version[0]))
-            _version_info.append(f
+            _version_info.append(f"rc{patch_version[1]}")
     return tuple(_version_info)
graphgen/{models/embed → bases}/__init__.py
RENAMED
File without changes
graphgen/bases/base_reader.py
ADDED
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+
+class BaseReader(ABC):
+    """
+    Abstract base class for reading and processing data.
+    """
+
+    def __init__(self, text_column: str = "content"):
+        self.text_column = text_column
+
+    @abstractmethod
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        """
+        Read data from the specified file path.
+
+        :param file_path: Path to the input file.
+        :return: List of dictionaries containing the data.
+        """
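Note: BaseReader is the extension point for the new reader layer — each reader added in this commit only implements read() and returns a list of dicts keyed by text_column. The snippet below is an illustrative sketch, not part of this commit: a hypothetical MarkdownReader that mirrors the TxtReader added later in this diff.

    from typing import Any, Dict, List

    from graphgen.bases.base_reader import BaseReader


    class MarkdownReader(BaseReader):
        """Hypothetical example: treat each non-empty line of a .md file as one document."""

        def read(self, file_path: str) -> List[Dict[str, Any]]:
            docs = []
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        # self.text_column defaults to "content", matching the bundled readers
                        docs.append({self.text_column: line})
            return docs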
graphgen/{models/storage → bases}/base_storage.py
RENAMED
@@ -1,8 +1,6 @@
 from dataclasses import dataclass
 from typing import Generic, TypeVar, Union

-from graphgen.models.embed.embedding import EmbeddingFunc
-
 T = TypeVar("T")


@@ -62,8 +60,6 @@ class BaseKVStorage(Generic[T], StorageNameSpace):

 @dataclass
 class BaseGraphStorage(StorageNameSpace):
-    embedding_func: EmbeddingFunc = None
-
     async def has_node(self, node_id: str) -> bool:
         raise NotImplementedError
graphgen/configs/aggregated_config.yaml
CHANGED
@@ -1,5 +1,4 @@
-
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: aggregated # atomic, aggregated, multi_hop, cot
 output_data_format: ChatML # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/atomic_config.yaml
CHANGED
@@ -1,5 +1,4 @@
-
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
 output_data_type: atomic # atomic, aggregated, multi_hop, cot
 output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/cot_config.yaml
CHANGED
@@ -1,5 +1,4 @@
-
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: cot # atomic, aggregated, multi_hop, cot
 output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/multi_hop_config.yaml
CHANGED
@@ -1,5 +1,4 @@
-
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
 output_data_format: ChatML # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/graphgen.py
CHANGED
@@ -7,7 +7,8 @@ from typing import Dict, List, Union, cast
 import gradio as gr
 from tqdm.asyncio import tqdm as tqdm_async

-from .models import (
+from graphgen.bases.base_storage import StorageNameSpace
+from graphgen.models import (
     Chunk,
     JsonKVStorage,
     JsonListStorage,

@@ -15,8 +16,9 @@ from .models import (
     OpenAIModel,
     Tokenizer,
     TraverseStrategy,
+    read_file,
 )
-
+
 from .operators import (
     extract_kg,
     generate_cot,

@@ -32,7 +34,6 @@ from .utils import (
     create_event_loop,
     format_generation_results,
     logger,
-    read_file,
 )

 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))

@@ -108,94 +109,54 @@ class GraphGen:
             namespace=f"qa-{self.unique_id}",
         )

-    async def async_split_chunks(
-        self, data: List[Union[List, Dict]], data_type: str
-    ) -> dict:
+    async def async_split_chunks(self, data: List[Union[List, Dict]]) -> dict:
         # TODO: configurable whether to use coreference resolution
         if len(data) == 0:
             return {}

         inserting_chunks = {}
-
-        assert isinstance(data, list) and isinstance(data[0], dict)
-        # compute hash for each document
-        new_docs = {
-            compute_content_hash(doc["content"], prefix="doc-"): {
-                "content": doc["content"]
-            }
-            for doc in data
-        }
-        _add_doc_keys = await self.full_docs_storage.filter_keys(
-            list(new_docs.keys())
-        )
-        new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-        if len(new_docs) == 0:
-            logger.warning("All docs are already in the storage")
-            return {}
-        logger.info("[New Docs] inserting %d docs", len(new_docs))
-
-        cur_index = 1
-        doc_number = len(new_docs)
-        async for doc_key, doc in tqdm_async(
-            new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
-        ):
-            chunks = {
-                compute_content_hash(dp["content"], prefix="chunk-"): {
-                    **dp,
-                    "full_doc_id": doc_key,
-                }
-                for dp in self.tokenizer_instance.chunk_by_token_size(
-                    doc["content"], self.chunk_overlap_size, self.chunk_size
-                )
-            }
-            inserting_chunks.update(chunks)
-
-            if self.progress_bar is not None:
-                self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
-            cur_index += 1
-
-        _add_doc_keys = await self.full_docs_storage.filter_keys(
-            list(new_docs.keys())
-        )
-        new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-        if len(new_docs) == 0:
-            logger.warning("All docs are already in the storage")
-            return {}
-        logger.info("[New Docs] inserting %d docs", len(new_docs))
-        async for doc in tqdm_async(
-            data, desc="[1/4]Chunking documents", unit="doc"
-        ):
-            doc_str = "".join([chunk["content"] for chunk in doc])
-            for chunk in doc:
-                chunk_key = compute_content_hash(chunk["content"], prefix="chunk-")
-                inserting_chunks[chunk_key] = {
-                    **chunk,
-                    "full_doc_id": compute_content_hash(doc_str, prefix="doc-"),
-                }
-        _add_chunk_keys = await self.text_chunks_storage.filter_keys(
-            list(inserting_chunks.keys())
-        )
-        inserting_chunks = {
-            k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-        }
+        assert isinstance(data, list) and isinstance(data[0], dict)
+
+        # compute hash for each document
+        new_docs = {
+            compute_content_hash(doc["content"], prefix="doc-"): {
+                "content": doc["content"]
+            }
+            for doc in data
+        }
+        _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
+        new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+        if len(new_docs) == 0:
+            logger.warning("All docs are already in the storage")
+            return {}
+        logger.info("[New Docs] inserting %d docs", len(new_docs))
+
+        cur_index = 1
+        doc_number = len(new_docs)
+        async for doc_key, doc in tqdm_async(
+            new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
+        ):
+            chunks = {
+                compute_content_hash(dp["content"], prefix="chunk-"): {
+                    **dp,
+                    "full_doc_id": doc_key,
+                }
+                for dp in self.tokenizer_instance.chunk_by_token_size(
+                    doc["content"], self.chunk_overlap_size, self.chunk_size
+                )
+            }
+            inserting_chunks.update(chunks)
+
+            if self.progress_bar is not None:
+                self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
+            cur_index += 1
+
+        _add_chunk_keys = await self.text_chunks_storage.filter_keys(
+            list(inserting_chunks.keys())
+        )
+        inserting_chunks = {
+            k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+        }
         await self.full_docs_storage.upsert(new_docs)
         await self.text_chunks_storage.upsert(inserting_chunks)

@@ -211,10 +172,8 @@ class GraphGen:
         """

         input_file = self.config["input_file"]
-        data_type = self.config["input_data_type"]
         data = read_file(input_file)
-
-        inserting_chunks = await self.async_split_chunks(data, data_type)
+        inserting_chunks = await self.async_split_chunks(data)

         if len(inserting_chunks) == 0:
             logger.warning("All chunks are already in the storage")
graphgen/judge.py
DELETED
@@ -1,60 +0,0 @@
-import os
-import argparse
-import asyncio
-from dotenv import load_dotenv
-
-from .models import NetworkXStorage, JsonKVStorage, OpenAIModel
-from .operators import judge_statement
-
-sys_path = os.path.abspath(os.path.dirname(__file__))
-
-load_dotenv()
-
-def calculate_average_loss(graph: NetworkXStorage):
-    """
-    Calculate the average loss of the graph.
-
-    :param graph: NetworkXStorage
-    :return: float
-    """
-    edges = asyncio.run(graph.get_all_edges())
-    total_loss = 0
-    for edge in edges:
-        total_loss += edge[2]['loss']
-    return total_loss / len(edges)
-
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', type=str, default=os.path.join(sys_path, "cache"), help='path to load input graph')
-    parser.add_argument('--output', type=str, default='cache/output/new_graph.graphml', help='path to save output')
-
-    args = parser.parse_args()
-
-    llm_client = OpenAIModel(
-        model_name=os.getenv("TRAINEE_MODEL"),
-        api_key=os.getenv("TRAINEE_API_KEY"),
-        base_url=os.getenv("TRAINEE_BASE_URL")
-    )
-
-    graph_storage = NetworkXStorage(
-        args.input,
-        namespace="graph"
-    )
-    average_loss = calculate_average_loss(graph_storage)
-    print(f"Average loss of the graph: {average_loss}")
-
-    rephrase_storage = JsonKVStorage(
-        os.path.join(sys_path, "cache"),
-        namespace="rephrase"
-    )
-
-    new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True))
-
-    graph_file = asyncio.run(graph_storage.get_graph())
-
-    new_graph.write_nx_graph(graph_file, args.output)
-
-    average_loss = calculate_average_loss(new_graph)
-    print(f"Average loss of the graph: {average_loss}")
graphgen/models/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from .evaluate.uni_evaluator import UniEvaluator
 from .llm.openai_model import OpenAIModel
 from .llm.tokenizer import Tokenizer
 from .llm.topk_token_model import Token, TopkTokenModel
+from .reader import read_file
 from .search.db.uniprot_search import UniProtSearch
 from .search.kg.wiki_search import WikiSearch
 from .search.web.bing_search import BingSearch
graphgen/models/embed/embedding.py
DELETED
@@ -1,29 +0,0 @@
-from dataclasses import dataclass
-import asyncio
-import numpy as np
-
-class UnlimitedSemaphore:
-    """A context manager that allows unlimited access."""
-
-    async def __aenter__(self):
-        pass
-
-    async def __aexit__(self, exc_type, exc, tb):
-        pass
-
-@dataclass
-class EmbeddingFunc:
-    embedding_dim: int
-    max_token_size: int
-    func: callable
-    concurrent_limit: int = 16
-
-    def __post_init__(self):
-        if self.concurrent_limit != 0:
-            self._semaphore = asyncio.Semaphore(self.concurrent_limit)
-        else:
-            self._semaphore = UnlimitedSemaphore()
-
-    async def __call__(self, *args, **kwargs) -> np.ndarray:
-        async with self._semaphore:
-            return await self.func(*args, **kwargs)
graphgen/models/reader/__init__.py
ADDED
@@ -0,0 +1,22 @@
+from .csv_reader import CsvReader
+from .json_reader import JsonReader
+from .jsonl_reader import JsonlReader
+from .txt_reader import TxtReader
+
+_MAPPING = {
+    "jsonl": JsonlReader,
+    "json": JsonReader,
+    "txt": TxtReader,
+    "csv": CsvReader,
+}
+
+
+def read_file(file_path: str):
+    suffix = file_path.split(".")[-1]
+    if suffix in _MAPPING:
+        reader = _MAPPING[suffix]()
+    else:
+        raise ValueError(
+            f"Unsupported file format: {suffix}. Supported formats are: {list(_MAPPING.keys())}"
+        )
+    return reader.read(file_path)
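Note: read_file chooses a reader class by file suffix and returns a list of dicts whose text sits under the "content" key. A minimal usage sketch, assuming the example paths referenced by the updated configs are available locally:

    from graphgen.models import read_file

    # ".jsonl" dispatches to JsonlReader
    docs = read_file("resources/input_examples/jsonl_demo.jsonl")
    print(len(docs), docs[0]["content"][:50])

    # Unsupported suffixes raise ValueError listing the supported formats
    try:
        read_file("notes.md")  # hypothetical path with an unmapped suffix
    except ValueError as err:
        print(err)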
graphgen/models/reader/csv_reader.py
ADDED
@@ -0,0 +1,14 @@
+from typing import Any, Dict, List
+
+import pandas as pd
+
+from graphgen.bases.base_reader import BaseReader
+
+
+class CsvReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+
+        df = pd.read_csv(file_path)
+        if self.text_column not in df.columns:
+            raise ValueError(f"Missing '{self.text_column}' column in CSV file.")
+        return df.to_dict(orient="records")
graphgen/models/reader/json_reader.py
ADDED
@@ -0,0 +1,18 @@
+import json
+from typing import Any, Dict, List
+
+from graphgen.bases.base_reader import BaseReader
+
+
+class JsonReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        if isinstance(data, list):
+            for doc in data:
+                if self.text_column not in doc:
+                    raise ValueError(
+                        f"Missing '{self.text_column}' in document: {doc}"
+                    )
+            return data
+        raise ValueError("JSON file must contain a list of documents.")
graphgen/models/reader/jsonl_reader.py
ADDED
@@ -0,0 +1,23 @@
+import json
+from typing import Any, Dict, List
+
+from graphgen.bases.base_reader import BaseReader
+from graphgen.utils import logger
+
+
+class JsonlReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        docs = []
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                try:
+                    doc = json.loads(line)
+                    if self.text_column in doc:
+                        docs.append(doc)
+                    else:
+                        raise ValueError(
+                            f"Missing '{self.text_column}' in document: {doc}"
+                        )
+                except json.JSONDecodeError as e:
+                    logger.error("Error decoding JSON line: %s. Error: %s", line, e)
+        return docs
graphgen/models/reader/txt_reader.py
ADDED
@@ -0,0 +1,14 @@
+from typing import Any, Dict, List
+
+from graphgen.bases.base_reader import BaseReader
+
+
+class TxtReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        docs = []
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    docs.append({self.text_column: line})
+        return docs
graphgen/models/storage/json_storage.py
CHANGED
@@ -1,7 +1,7 @@
 import os
 from dataclasses import dataclass

-from graphgen.models.storage.base_storage import BaseKVStorage, BaseListStorage
+from graphgen.bases.base_storage import BaseKVStorage, BaseListStorage
 from graphgen.utils import load_json, logger, write_json

graphgen/models/storage/networkx_storage.py
CHANGED
@@ -1,11 +1,13 @@
-import os
 import html
-
+import os
 from dataclasses import dataclass
+from typing import Any, Optional, Union, cast
+
 import networkx as nx

+from graphgen.bases.base_storage import BaseGraphStorage
 from graphgen.utils import logger
-
+

 @dataclass
 class NetworkXStorage(BaseGraphStorage):

@@ -17,7 +19,11 @@ class NetworkXStorage(BaseGraphStorage):

     @staticmethod
     def write_nx_graph(graph: nx.Graph, file_name):
-        logger.info(
+        logger.info(
+            "Writing graph with %d nodes, %d edges",
+            graph.number_of_nodes(),
+            graph.number_of_edges(),
+        )
         nx.write_graphml(graph, file_name)

     @staticmethod

@@ -77,8 +83,10 @@ class NetworkXStorage(BaseGraphStorage):
         preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
         if preloaded_graph is not None:
             logger.info(
-                "Loaded graph from %s with %d nodes, %d edges",
-
+                "Loaded graph from %s with %d nodes, %d edges",
+                self._graphml_xml_file,
+                preloaded_graph.number_of_nodes(),
+                preloaded_graph.number_of_edges(),
             )
         self._graph = preloaded_graph or nx.Graph()

@@ -111,7 +119,9 @@ class NetworkXStorage(BaseGraphStorage):
     async def get_all_edges(self) -> Union[list[dict], None]:
         return self._graph.edges(data=True)

-    async def get_node_edges(
+    async def get_node_edges(
+        self, source_node_id: str
+    ) -> Union[list[tuple[str, str]], None]:
         if self._graph.has_node(source_node_id):
             return list(self._graph.edges(source_node_id, data=True))
         return None

@@ -133,11 +143,17 @@ class NetworkXStorage(BaseGraphStorage):
     ):
         self._graph.add_edge(source_node_id, target_node_id, **edge_data)

-    async def update_edge(
+    async def update_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
         if self._graph.has_edge(source_node_id, target_node_id):
             self._graph.edges[(source_node_id, target_node_id)].update(edge_data)
         else:
-            logger.warning(
+            logger.warning(
+                "Edge %s -> %s not found in the graph for update.",
+                source_node_id,
+                target_node_id,
+            )

     async def delete_node(self, node_id: str):
         """
graphgen/models/strategy/base_strategy.py
DELETED
@@ -1,5 +0,0 @@
-from dataclasses import dataclass
-
-@dataclass
-class BaseStrategy:
-    pass
graphgen/models/strategy/travserse_strategy.py
CHANGED
@@ -1,14 +1,12 @@
 from dataclasses import dataclass, fields

-from graphgen.models.strategy.base_strategy import BaseStrategy
-

 @dataclass
-class TraverseStrategy(BaseStrategy):
+class TraverseStrategy:
     # Form of the generated QA: atomic, multi-hop, aggregated
-    qa_form: str = "atomic"
+    qa_form: str = "atomic"  # "atomic" or "multi_hop" or "aggregated"
     # Only one of the max-edge-count and max-token-count methods takes effect
-    expand_method: str = "max_tokens"
+    expand_method: str = "max_tokens"  # "max_width" or "max_tokens"
     # Unidirectional or bidirectional expansion
     bidirectional: bool = True
     # Maximum number of edges to expand in each direction

@@ -18,9 +16,9 @@ class TraverseStrategy(BaseStrategy):
     # Maximum expansion depth in each direction
     max_depth: int = 2
     # Strategy for picking edges within the same level (for bidirectional expansion, a level is the set of edges connected on both sides)
-    edge_sampling: str = "max_loss"
+    edge_sampling: str = "max_loss"  # "max_loss" or "min_loss" or "random"
     # Strategy for handling isolated nodes
-    isolated_node_strategy: str = "add"
+    isolated_node_strategy: str = "add"  # "add" or "ignore"
     loss_strategy: str = "only_edge"  # only_edge, both

     def to_yaml(self):
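Note: the inline value comments added above make the accepted options explicit. A small illustrative sketch of overriding the defaults (the chosen values are arbitrary examples, not recommendations from this commit):

    from graphgen.models import TraverseStrategy

    strategy = TraverseStrategy(
        qa_form="multi_hop",              # "atomic", "multi_hop" or "aggregated"
        expand_method="max_width",        # "max_width" or "max_tokens"
        edge_sampling="random",           # "max_loss", "min_loss" or "random"
        isolated_node_strategy="ignore",  # "add" or "ignore"
    )
    print(strategy.bidirectional, strategy.max_depth)  # dataclass defaults: True, 2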
graphgen/operators/kg/extract_kg.py
CHANGED
@@ -6,8 +6,8 @@ from typing import List
 import gradio as gr
 from tqdm.asyncio import tqdm as tqdm_async

+from graphgen.bases.base_storage import BaseGraphStorage
 from graphgen.models import Chunk, OpenAIModel, Tokenizer
-from graphgen.models.storage.base_storage import BaseGraphStorage
 from graphgen.operators.kg.merge_kg import merge_edges, merge_nodes
 from graphgen.templates import KG_EXTRACTION_PROMPT
 from graphgen.utils import (
graphgen/operators/kg/merge_kg.py
CHANGED
@@ -3,8 +3,8 @@ from collections import Counter

 from tqdm.asyncio import tqdm as tqdm_async

+from graphgen.bases.base_storage import BaseGraphStorage
 from graphgen.models import Tokenizer, TopkTokenModel
-from graphgen.models.storage.base_storage import BaseGraphStorage
 from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT
 from graphgen.utils import detect_main_language, logger
 from graphgen.utils.format import split_string_by_multi_markers
graphgen/utils/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 from .calculate_confidence import yes_no_loss_entropy
 from .detect_lang import detect_if_chinese, detect_main_language
-from .file import read_file
 from .format import (
     format_generation_results,
     handle_single_entity_extraction,
graphgen/utils/file.py
DELETED
@@ -1,24 +0,0 @@
-import json
-
-
-def read_file(input_file: str) -> list:
-    """
-    Read data from a file based on the specified data type.
-    :param input_file
-    :return:
-    """
-
-    if input_file.endswith(".jsonl"):
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = [json.loads(line) for line in f]
-    elif input_file.endswith(".json"):
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = json.load(f)
-    elif input_file.endswith(".txt"):
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = [line.strip() for line in f if line.strip()]
-            data = [{"content": line} for line in data]
-    else:
-        raise ValueError(f"Unsupported file format: {input_file}")
-
-    return data
webui/app.py
CHANGED
@@ -116,35 +116,6 @@ def run_graphgen(params, progress=gr.Progress()):
         env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
     )

-    # Load input data
-    file = config["input_file"]
-    if isinstance(file, list):
-        file = file[0]
-
-    data = []
-
-    if file.endswith(".jsonl"):
-        config["input_data_type"] = "raw"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.loads(line) for line in f)
-    elif file.endswith(".json"):
-        config["input_data_type"] = "chunked"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.load(f))
-    elif file.endswith(".txt"):
-        # After reading the file, convert it into raw-format data according to chunk_size
-        config["input_data_type"] = "raw"
-        content = ""
-        with open(file, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-            for line in lines:
-                content += line.strip() + " "
-        size = int(config.get("chunk_size", 512))
-        chunks = [content[i : i + size] for i in range(0, len(content), size)]
-        data.extend([{"content": chunk} for chunk in chunks])
-    else:
-        raise ValueError(f"Unsupported file type: {file}")
-
     # Initialize GraphGen
     graph_gen = init_graph_gen(config, env)
     graph_gen.clear()

@@ -436,19 +407,20 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
             upload_file = gr.File(
                 label=_("Upload File"),
                 file_count="single",
-                file_types=[".txt", ".json", ".jsonl"],
+                file_types=[".txt", ".json", ".jsonl", ".csv"],
                 interactive=True,
             )
             examples_dir = os.path.join(root_dir, "webui", "examples")
             gr.Examples(
                 examples=[
                     [os.path.join(examples_dir, "txt_demo.txt")],
-                    [os.path.join(examples_dir, "
-                    [os.path.join(examples_dir, "
+                    [os.path.join(examples_dir, "jsonl_demo.jsonl")],
+                    [os.path.join(examples_dir, "json_demo.json")],
+                    [os.path.join(examples_dir, "csv_demo.csv")],
                 ],
                 inputs=upload_file,
                 label=_("Example Files"),
-                examples_per_page=
+                examples_per_page=4,
             )
         with gr.Column(scale=1):
            output = gr.File(
webui/count_tokens.py
CHANGED
@@ -1,6 +1,7 @@
+import json
 import os
 import sys
-
+
 import pandas as pd

 # pylint: disable=wrong-import-position

@@ -8,24 +9,29 @@ root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(root_dir)
 from graphgen.models import Tokenizer

+
 def count_tokens(file, tokenizer_name, data_frame):
     if not file or not os.path.exists(file):
         return data_frame

     if file.endswith(".jsonl"):
-        with open(file, "r", encoding=
+        with open(file, "r", encoding="utf-8") as f:
             data = [json.loads(line) for line in f]
     elif file.endswith(".json"):
-        with open(file, "r", encoding=
+        with open(file, "r", encoding="utf-8") as f:
             data = json.load(f)
             data = [item for sublist in data for item in sublist]
     elif file.endswith(".txt"):
-        with open(file, "r", encoding=
+        with open(file, "r", encoding="utf-8") as f:
             data = f.read()
-        chunks = [
-            data[i:i + 512] for i in range(0, len(data), 512)
-        ]
+        chunks = [data[i : i + 512] for i in range(0, len(data), 512)]
         data = [{"content": chunk} for chunk in chunks]
+    elif file.endswith(".csv"):
+        df = pd.read_csv(file)
+        if "content" in df.columns:
+            data = df["content"].tolist()
+        else:
+            data = df.iloc[:, 0].tolist()
     else:
         raise ValueError(f"Unsupported file type: {file}")

@@ -41,20 +47,13 @@ def count_tokens(file, tokenizer_name, data_frame):
             content = item
         token_count += len(tokenizer.encode_string(content))

-    _update_data = [[
-        str(token_count),
-        str(token_count * 50),
-        "N/A"
-    ]]
+    _update_data = [[str(token_count), str(token_count * 50), "N/A"]]

     try:
-        new_df = pd.DataFrame(
-            _update_data,
-            columns=data_frame.columns
-        )
+        new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
         data_frame = new_df

-    except Exception as e:
+    except Exception as e:  # pylint: disable=broad-except
         print("[ERROR] DataFrame操作异常:", str(e))

     return data_frame
webui/examples/chunked_demo.json
DELETED
@@ -1,14 +0,0 @@
-[
-    [
-        {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}
-    ],
-    [
-        {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}
-    ],
-    [
-        {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}
-    ],
-    [
-        {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."}
-    ]
-]
webui/examples/csv_demo.csv
ADDED
@@ -0,0 +1,5 @@
+content
+"云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"
+"隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"
+"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."
+"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."
webui/examples/json_demo.json
ADDED
@@ -0,0 +1,6 @@
+[
+    {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"},
+    {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"},
+    {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."},
+    {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."}
+]
webui/examples/{raw_demo.jsonl → jsonl_demo.jsonl}
RENAMED
File without changes