github-actions[bot] committed
Commit d2a63cc · 1 Parent(s): 4e14cb8

Auto-sync from demo at Tue Sep 23 09:05:50 UTC 2025
app.py CHANGED
@@ -116,35 +116,6 @@ def run_graphgen(params, progress=gr.Progress()):
         env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
     )
 
-    # Load input data
-    file = config["input_file"]
-    if isinstance(file, list):
-        file = file[0]
-
-    data = []
-
-    if file.endswith(".jsonl"):
-        config["input_data_type"] = "raw"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.loads(line) for line in f)
-    elif file.endswith(".json"):
-        config["input_data_type"] = "chunked"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.load(f))
-    elif file.endswith(".txt"):
-        # After reading the file, convert it into raw-format data according to chunk_size
-        config["input_data_type"] = "raw"
-        content = ""
-        with open(file, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-            for line in lines:
-                content += line.strip() + " "
-        size = int(config.get("chunk_size", 512))
-        chunks = [content[i : i + size] for i in range(0, len(content), size)]
-        data.extend([{"content": chunk} for chunk in chunks])
-    else:
-        raise ValueError(f"Unsupported file type: {file}")
-
     # Initialize GraphGen
     graph_gen = init_graph_gen(config, env)
     graph_gen.clear()
@@ -436,19 +407,20 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
 upload_file = gr.File(
     label=_("Upload File"),
     file_count="single",
-    file_types=[".txt", ".json", ".jsonl"],
+    file_types=[".txt", ".json", ".jsonl", ".csv"],
     interactive=True,
 )
 examples_dir = os.path.join(root_dir, "webui", "examples")
 gr.Examples(
     examples=[
         [os.path.join(examples_dir, "txt_demo.txt")],
-        [os.path.join(examples_dir, "raw_demo.jsonl")],
-        [os.path.join(examples_dir, "chunked_demo.json")],
+        [os.path.join(examples_dir, "jsonl_demo.jsonl")],
+        [os.path.join(examples_dir, "json_demo.json")],
+        [os.path.join(examples_dir, "csv_demo.csv")],
     ],
     inputs=upload_file,
     label=_("Example Files"),
-    examples_per_page=3,
+    examples_per_page=4,
 )
 with gr.Column(scale=1):
     output = gr.File(
graphgen/{version.py → _version.py} RENAMED
@@ -1,7 +1,6 @@
-
 from typing import Tuple
 
-__version__ = '20250416'
+__version__ = "20250416"
 short_version = __version__
 
 
@@ -15,13 +14,13 @@ def parse_version_info(version_str: str) -> Tuple:
         tuple: A sequence of integer and string represents version.
     """
     _version_info = []
-    for x in version_str.split('.'):
+    for x in version_str.split("."):
         if x.isdigit():
             _version_info.append(int(x))
-        elif x.find('rc') != -1:
-            patch_version = x.split('rc')
+        elif x.find("rc") != -1:
+            patch_version = x.split("rc")
             _version_info.append(int(patch_version[0]))
-            _version_info.append(f'rc{patch_version[1]}')
+            _version_info.append(f"rc{patch_version[1]}")
     return tuple(_version_info)
 
 
graphgen/{models/embed → bases}/__init__.py RENAMED
File without changes
graphgen/bases/base_reader.py ADDED
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+
+class BaseReader(ABC):
+    """
+    Abstract base class for reading and processing data.
+    """
+
+    def __init__(self, text_column: str = "content"):
+        self.text_column = text_column
+
+    @abstractmethod
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        """
+        Read data from the specified file path.
+
+        :param file_path: Path to the input file.
+        :return: List of dictionaries containing the data.
+        """
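For orientation only, a minimal sketch of what a concrete subclass of this ABC looks like; the WholeFileReader name, its single-document behaviour, and the README.md path are hypothetical and not part of this commit.

from typing import Any, Dict, List

from graphgen.bases.base_reader import BaseReader


class WholeFileReader(BaseReader):
    """Hypothetical example: treat the entire file as a single document."""

    def read(self, file_path: str) -> List[Dict[str, Any]]:
        with open(file_path, "r", encoding="utf-8") as f:
            # self.text_column defaults to "content" but can be overridden per instance.
            return [{self.text_column: f.read()}]


# Usage sketch with a custom text column name.
docs = WholeFileReader(text_column="text").read("README.md")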
graphgen/{models/storage → bases}/base_storage.py RENAMED
@@ -1,8 +1,6 @@
 from dataclasses import dataclass
 from typing import Generic, TypeVar, Union
 
-from graphgen.models.embed.embedding import EmbeddingFunc
-
 T = TypeVar("T")
 
 
@@ -62,8 +60,6 @@ class BaseKVStorage(Generic[T], StorageNameSpace):
 
 @dataclass
 class BaseGraphStorage(StorageNameSpace):
-    embedding_func: EmbeddingFunc = None
-
     async def has_node(self, node_id: str) -> bool:
         raise NotImplementedError
 
graphgen/configs/aggregated_config.yaml CHANGED
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: aggregated # atomic, aggregated, multi_hop, cot
 output_data_format: ChatML # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/atomic_config.yaml CHANGED
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
 output_data_type: atomic # atomic, aggregated, multi_hop, cot
 output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/cot_config.yaml CHANGED
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: cot # atomic, aggregated, multi_hop, cot
 output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/multi_hop_config.yaml CHANGED
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
 output_data_format: ChatML # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
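Taken together, the four config diffs drop input_data_type and keep only input_file. A rough sketch, under stated assumptions, of how such a config can be consumed with the new reader entry point; the yaml loading shown here is illustrative and not code from this commit.

import yaml

from graphgen.models import read_file

# Illustrative only: load one of the configs shown above and read its input file.
with open("graphgen/configs/multi_hop_config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

docs = read_file(config["input_file"])  # e.g. resources/input_examples/csv_demo.csv
print(len(docs), docs[0]["content"][:50])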
graphgen/graphgen.py CHANGED
@@ -7,7 +7,8 @@ from typing import Dict, List, Union, cast
 import gradio as gr
 from tqdm.asyncio import tqdm as tqdm_async
 
-from .models import (
+from graphgen.bases.base_storage import StorageNameSpace
+from graphgen.models import (
     Chunk,
     JsonKVStorage,
     JsonListStorage,
@@ -15,8 +16,9 @@ from .models import (
     OpenAIModel,
     Tokenizer,
     TraverseStrategy,
+    read_file,
 )
-from .models.storage.base_storage import StorageNameSpace
+
 from .operators import (
     extract_kg,
     generate_cot,
@@ -32,7 +34,6 @@ from .utils import (
     create_event_loop,
     format_generation_results,
     logger,
-    read_file,
 )
 
 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -108,94 +109,54 @@ class GraphGen:
             namespace=f"qa-{self.unique_id}",
         )
 
-    async def async_split_chunks(
-        self, data: List[Union[List, Dict]], data_type: str
-    ) -> dict:
+    async def async_split_chunks(self, data: List[Union[List, Dict]]) -> dict:
         # TODO: configurable whether to use coreference resolution
         if len(data) == 0:
             return {}
 
         inserting_chunks = {}
-        if data_type == "raw":
-            assert isinstance(data, list) and isinstance(data[0], dict)
-            # compute hash for each document
-            new_docs = {
-                compute_content_hash(doc["content"], prefix="doc-"): {
-                    "content": doc["content"]
-                }
-                for doc in data
-            }
-            _add_doc_keys = await self.full_docs_storage.filter_keys(
-                list(new_docs.keys())
-            )
-            new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-            if len(new_docs) == 0:
-                logger.warning("All docs are already in the storage")
-                return {}
-            logger.info("[New Docs] inserting %d docs", len(new_docs))
-
-            cur_index = 1
-            doc_number = len(new_docs)
-            async for doc_key, doc in tqdm_async(
-                new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
-            ):
-                chunks = {
-                    compute_content_hash(dp["content"], prefix="chunk-"): {
-                        **dp,
-                        "full_doc_id": doc_key,
-                    }
-                    for dp in self.tokenizer_instance.chunk_by_token_size(
-                        doc["content"], self.chunk_overlap_size, self.chunk_size
-                    )
-                }
-                inserting_chunks.update(chunks)
-
-                if self.progress_bar is not None:
-                    self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
-                    cur_index += 1
-
-            _add_chunk_keys = await self.text_chunks_storage.filter_keys(
-                list(inserting_chunks.keys())
-            )
-            inserting_chunks = {
-                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-            }
-        elif data_type == "chunked":
-            assert isinstance(data, list) and isinstance(data[0], list)
-            new_docs = {
-                compute_content_hash("".join(chunk["content"]), prefix="doc-"): {
-                    "content": "".join(chunk["content"])
-                }
-                for doc in data
-                for chunk in doc
-            }
-            _add_doc_keys = await self.full_docs_storage.filter_keys(
-                list(new_docs.keys())
-            )
-            new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-            if len(new_docs) == 0:
-                logger.warning("All docs are already in the storage")
-                return {}
-            logger.info("[New Docs] inserting %d docs", len(new_docs))
-            async for doc in tqdm_async(
-                data, desc="[1/4]Chunking documents", unit="doc"
-            ):
-                doc_str = "".join([chunk["content"] for chunk in doc])
-                for chunk in doc:
-                    chunk_key = compute_content_hash(chunk["content"], prefix="chunk-")
-                    inserting_chunks[chunk_key] = {
-                        **chunk,
-                        "full_doc_id": compute_content_hash(doc_str, prefix="doc-"),
-                    }
-            _add_chunk_keys = await self.text_chunks_storage.filter_keys(
-                list(inserting_chunks.keys())
-            )
-            inserting_chunks = {
-                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-            }
-        else:
-            raise ValueError(f"Unknown data type: {data_type}")
+        assert isinstance(data, list) and isinstance(data[0], dict)
 
+        # compute hash for each document
+        new_docs = {
+            compute_content_hash(doc["content"], prefix="doc-"): {
+                "content": doc["content"]
+            }
+            for doc in data
+        }
+        _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
+        new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+        if len(new_docs) == 0:
+            logger.warning("All docs are already in the storage")
+            return {}
+        logger.info("[New Docs] inserting %d docs", len(new_docs))
+
+        cur_index = 1
+        doc_number = len(new_docs)
+        async for doc_key, doc in tqdm_async(
+            new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
+        ):
+            chunks = {
+                compute_content_hash(dp["content"], prefix="chunk-"): {
+                    **dp,
+                    "full_doc_id": doc_key,
+                }
+                for dp in self.tokenizer_instance.chunk_by_token_size(
+                    doc["content"], self.chunk_overlap_size, self.chunk_size
+                )
+            }
+            inserting_chunks.update(chunks)
+
+            if self.progress_bar is not None:
+                self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
+                cur_index += 1
+
+        _add_chunk_keys = await self.text_chunks_storage.filter_keys(
+            list(inserting_chunks.keys())
+        )
+        inserting_chunks = {
+            k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+        }
         await self.full_docs_storage.upsert(new_docs)
         await self.text_chunks_storage.upsert(inserting_chunks)
 
@@ -211,10 +172,8 @@ class GraphGen:
         """
 
         input_file = self.config["input_file"]
-        data_type = self.config["input_data_type"]
         data = read_file(input_file)
-
-        inserting_chunks = await self.async_split_chunks(data, data_type)
+        inserting_chunks = await self.async_split_chunks(data)
 
         if len(inserting_chunks) == 0:
             logger.warning("All chunks are already in the storage")
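A small sanity check of the contract this diff relies on, assuming the repository root as the working directory: read_file returns a list of dicts, which is exactly what the slimmed-down async_split_chunks asserts before chunking. The path used here is just one of the example inputs referenced in the configs above.

from graphgen.models import read_file

# read_file (defined in graphgen/models/reader/__init__.py below) returns a
# list of dicts keyed by the reader's text_column ("content" by default).
data = read_file("resources/input_examples/json_demo.json")
assert isinstance(data, list) and isinstance(data[0], dict)
assert "content" in data[0]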
graphgen/judge.py DELETED
@@ -1,60 +0,0 @@
-import os
-import argparse
-import asyncio
-from dotenv import load_dotenv
-
-from .models import NetworkXStorage, JsonKVStorage, OpenAIModel
-from .operators import judge_statement
-
-sys_path = os.path.abspath(os.path.dirname(__file__))
-
-load_dotenv()
-
-def calculate_average_loss(graph: NetworkXStorage):
-    """
-    Calculate the average loss of the graph.
-
-    :param graph: NetworkXStorage
-    :return: float
-    """
-    edges = asyncio.run(graph.get_all_edges())
-    total_loss = 0
-    for edge in edges:
-        total_loss += edge[2]['loss']
-    return total_loss / len(edges)
-
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', type=str, default=os.path.join(sys_path, "cache"), help='path to load input graph')
-    parser.add_argument('--output', type=str, default='cache/output/new_graph.graphml', help='path to save output')
-
-    args = parser.parse_args()
-
-    llm_client = OpenAIModel(
-        model_name=os.getenv("TRAINEE_MODEL"),
-        api_key=os.getenv("TRAINEE_API_KEY"),
-        base_url=os.getenv("TRAINEE_BASE_URL")
-    )
-
-    graph_storage = NetworkXStorage(
-        args.input,
-        namespace="graph"
-    )
-    average_loss = calculate_average_loss(graph_storage)
-    print(f"Average loss of the graph: {average_loss}")
-
-    rephrase_storage = JsonKVStorage(
-        os.path.join(sys_path, "cache"),
-        namespace="rephrase"
-    )
-
-    new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True))
-
-    graph_file = asyncio.run(graph_storage.get_graph())
-
-    new_graph.write_nx_graph(graph_file, args.output)
-
-    average_loss = calculate_average_loss(new_graph)
-    print(f"Average loss of the graph: {average_loss}")
graphgen/models/__init__.py CHANGED
@@ -6,6 +6,7 @@ from .evaluate.uni_evaluator import UniEvaluator
 from .llm.openai_model import OpenAIModel
 from .llm.tokenizer import Tokenizer
 from .llm.topk_token_model import Token, TopkTokenModel
+from .reader import read_file
 from .search.db.uniprot_search import UniProtSearch
 from .search.kg.wiki_search import WikiSearch
 from .search.web.bing_search import BingSearch
graphgen/models/embed/embedding.py DELETED
@@ -1,29 +0,0 @@
-from dataclasses import dataclass
-import asyncio
-import numpy as np
-
-class UnlimitedSemaphore:
-    """A context manager that allows unlimited access."""
-
-    async def __aenter__(self):
-        pass
-
-    async def __aexit__(self, exc_type, exc, tb):
-        pass
-
-@dataclass
-class EmbeddingFunc:
-    embedding_dim: int
-    max_token_size: int
-    func: callable
-    concurrent_limit: int = 16
-
-    def __post_init__(self):
-        if self.concurrent_limit != 0:
-            self._semaphore = asyncio.Semaphore(self.concurrent_limit)
-        else:
-            self._semaphore = UnlimitedSemaphore()
-
-    async def __call__(self, *args, **kwargs) -> np.ndarray:
-        async with self._semaphore:
-            return await self.func(*args, **kwargs)
graphgen/models/reader/__init__.py ADDED
@@ -0,0 +1,22 @@
+from .csv_reader import CsvReader
+from .json_reader import JsonReader
+from .jsonl_reader import JsonlReader
+from .txt_reader import TxtReader
+
+_MAPPING = {
+    "jsonl": JsonlReader,
+    "json": JsonReader,
+    "txt": TxtReader,
+    "csv": CsvReader,
+}
+
+
+def read_file(file_path: str):
+    suffix = file_path.split(".")[-1]
+    if suffix in _MAPPING:
+        reader = _MAPPING[suffix]()
+    else:
+        raise ValueError(
+            f"Unsupported file format: {suffix}. Supported formats are: {list(_MAPPING.keys())}"
+        )
+    return reader.read(file_path)
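For illustration, dispatch in read_file is purely by file suffix, and anything outside _MAPPING raises ValueError; the .docx path below is hypothetical, while the .txt path is one of the example inputs referenced in the configs above.

from graphgen.models.reader import read_file

docs = read_file("resources/input_examples/txt_demo.txt")  # handled by TxtReader
print(docs[0]["content"])

try:
    read_file("notes.docx")  # hypothetical unsupported suffix
except ValueError as err:
    # "Unsupported file format: docx. Supported formats are: ['jsonl', 'json', 'txt', 'csv']"
    print(err)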
graphgen/models/reader/csv_reader.py ADDED
@@ -0,0 +1,14 @@
+from typing import Any, Dict, List
+
+import pandas as pd
+
+from graphgen.bases.base_reader import BaseReader
+
+
+class CsvReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+
+        df = pd.read_csv(file_path)
+        if self.text_column not in df.columns:
+            raise ValueError(f"Missing '{self.text_column}' column in CSV file.")
+        return df.to_dict(orient="records")
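A quick usage sketch (the throwaway CSV below is fabricated for this example): CsvReader requires a column named after text_column, "content" by default, and returns one dict per row.

import pandas as pd

from graphgen.models.reader.csv_reader import CsvReader

# Write a temporary CSV just for this example.
pd.DataFrame({"content": ["row one", "row two"]}).to_csv("tmp_demo.csv", index=False)

docs = CsvReader().read("tmp_demo.csv")
print(docs)  # [{'content': 'row one'}, {'content': 'row two'}]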
graphgen/models/reader/json_reader.py ADDED
@@ -0,0 +1,18 @@
+import json
+from typing import Any, Dict, List
+
+from graphgen.bases.base_reader import BaseReader
+
+
+class JsonReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        if isinstance(data, list):
+            for doc in data:
+                if self.text_column not in doc:
+                    raise ValueError(
+                        f"Missing '{self.text_column}' in document: {doc}"
+                    )
+            return data
+        raise ValueError("JSON file must contain a list of documents.")
graphgen/models/reader/jsonl_reader.py ADDED
@@ -0,0 +1,23 @@
+import json
+from typing import Any, Dict, List
+
+from graphgen.bases.base_reader import BaseReader
+from graphgen.utils import logger
+
+
+class JsonlReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        docs = []
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                try:
+                    doc = json.loads(line)
+                    if self.text_column in doc:
+                        docs.append(doc)
+                    else:
+                        raise ValueError(
+                            f"Missing '{self.text_column}' in document: {doc}"
+                        )
+                except json.JSONDecodeError as e:
+                    logger.error("Error decoding JSON line: %s. Error: %s", line, e)
+        return docs
graphgen/models/reader/txt_reader.py ADDED
@@ -0,0 +1,14 @@
+from typing import Any, Dict, List
+
+from graphgen.bases.base_reader import BaseReader
+
+
+class TxtReader(BaseReader):
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        docs = []
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    docs.append({self.text_column: line})
+        return docs
graphgen/models/storage/json_storage.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from dataclasses import dataclass
 
-from graphgen.models.storage.base_storage import BaseKVStorage, BaseListStorage
+from graphgen.bases.base_storage import BaseKVStorage, BaseListStorage
 from graphgen.utils import load_json, logger, write_json
 
 
graphgen/models/storage/networkx_storage.py CHANGED
@@ -1,11 +1,13 @@
-import os
 import html
-from typing import Any, Union, cast, Optional
+import os
 from dataclasses import dataclass
+from typing import Any, Optional, Union, cast
+
 import networkx as nx
 
+from graphgen.bases.base_storage import BaseGraphStorage
 from graphgen.utils import logger
-from .base_storage import BaseGraphStorage
+
 
 @dataclass
 class NetworkXStorage(BaseGraphStorage):
@@ -17,7 +19,11 @@ class NetworkXStorage(BaseGraphStorage):
 
     @staticmethod
     def write_nx_graph(graph: nx.Graph, file_name):
-        logger.info("Writing graph with %d nodes, %d edges", graph.number_of_nodes(), graph.number_of_edges())
+        logger.info(
+            "Writing graph with %d nodes, %d edges",
+            graph.number_of_nodes(),
+            graph.number_of_edges(),
+        )
         nx.write_graphml(graph, file_name)
 
     @staticmethod
@@ -77,8 +83,10 @@ class NetworkXStorage(BaseGraphStorage):
         preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
         if preloaded_graph is not None:
             logger.info(
-                "Loaded graph from %s with %d nodes, %d edges", self._graphml_xml_file,
-                preloaded_graph.number_of_nodes(), preloaded_graph.number_of_edges()
+                "Loaded graph from %s with %d nodes, %d edges",
+                self._graphml_xml_file,
+                preloaded_graph.number_of_nodes(),
+                preloaded_graph.number_of_edges(),
             )
         self._graph = preloaded_graph or nx.Graph()
 
@@ -111,7 +119,9 @@ class NetworkXStorage(BaseGraphStorage):
     async def get_all_edges(self) -> Union[list[dict], None]:
        return self._graph.edges(data=True)
 
-    async def get_node_edges(self, source_node_id: str) -> Union[list[tuple[str, str]], None]:
+    async def get_node_edges(
+        self, source_node_id: str
+    ) -> Union[list[tuple[str, str]], None]:
         if self._graph.has_node(source_node_id):
             return list(self._graph.edges(source_node_id, data=True))
         return None
@@ -133,11 +143,17 @@ class NetworkXStorage(BaseGraphStorage):
     ):
         self._graph.add_edge(source_node_id, target_node_id, **edge_data)
 
-    async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]):
+    async def update_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
        if self._graph.has_edge(source_node_id, target_node_id):
            self._graph.edges[(source_node_id, target_node_id)].update(edge_data)
        else:
-            logger.warning("Edge %s -> %s not found in the graph for update.", source_node_id, target_node_id)
+            logger.warning(
+                "Edge %s -> %s not found in the graph for update.",
+                source_node_id,
+                target_node_id,
+            )
 
     async def delete_node(self, node_id: str):
         """
graphgen/models/strategy/base_strategy.py DELETED
@@ -1,5 +0,0 @@
-from dataclasses import dataclass
-
-@dataclass
-class BaseStrategy:
-    pass
graphgen/models/strategy/travserse_strategy.py CHANGED
@@ -1,14 +1,12 @@
 from dataclasses import dataclass, fields
 
-from graphgen.models.strategy.base_strategy import BaseStrategy
-
 
 @dataclass
-class TraverseStrategy(BaseStrategy):
+class TraverseStrategy:
     # Form of the generated QA pairs: atomic, multi-hop, or aggregated
-    qa_form: str = "atomic"  # "atomic" or "multi_hop" or "aggregated"
+    qa_form: str = "atomic"  # "atomic" or "multi_hop" or "aggregated"
     # Only one of the max-edge-count and max-token-count methods takes effect
-    expand_method: str = "max_tokens"  # "max_width" or "max_tokens"
+    expand_method: str = "max_tokens"  # "max_width" or "max_tokens"
     # Unidirectional or bidirectional expansion
     bidirectional: bool = True
     # Maximum number of edges to expand in each direction
@@ -18,9 +16,9 @@ class TraverseStrategy(BaseStrategy):
     # Maximum expansion depth in each direction
     max_depth: int = 2
    # Edge-selection strategy within a layer (for bidirectional expansion, a layer is the set of edges connecting the two sides)
-    edge_sampling: str = "max_loss"  # "max_loss" or "min_loss" or "random"
+    edge_sampling: str = "max_loss"  # "max_loss" or "min_loss" or "random"
    # Strategy for handling isolated nodes
-    isolated_node_strategy: str = "add"  # "add" or "ignore"
+    isolated_node_strategy: str = "add"  # "add" or "ignore"
     loss_strategy: str = "only_edge"  # only_edge, both
 
     def to_yaml(self):
graphgen/operators/kg/extract_kg.py CHANGED
@@ -6,8 +6,8 @@ from typing import List
 import gradio as gr
 from tqdm.asyncio import tqdm as tqdm_async
 
+from graphgen.bases.base_storage import BaseGraphStorage
 from graphgen.models import Chunk, OpenAIModel, Tokenizer
-from graphgen.models.storage.base_storage import BaseGraphStorage
 from graphgen.operators.kg.merge_kg import merge_edges, merge_nodes
 from graphgen.templates import KG_EXTRACTION_PROMPT
 from graphgen.utils import (
graphgen/operators/kg/merge_kg.py CHANGED
@@ -3,8 +3,8 @@ from collections import Counter
 
 from tqdm.asyncio import tqdm as tqdm_async
 
+from graphgen.bases.base_storage import BaseGraphStorage
 from graphgen.models import Tokenizer, TopkTokenModel
-from graphgen.models.storage.base_storage import BaseGraphStorage
 from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT
 from graphgen.utils import detect_main_language, logger
 from graphgen.utils.format import split_string_by_multi_markers
graphgen/utils/__init__.py CHANGED
@@ -1,6 +1,5 @@
 from .calculate_confidence import yes_no_loss_entropy
 from .detect_lang import detect_if_chinese, detect_main_language
-from .file import read_file
 from .format import (
     format_generation_results,
     handle_single_entity_extraction,
graphgen/utils/file.py DELETED
@@ -1,24 +0,0 @@
-import json
-
-
-def read_file(input_file: str) -> list:
-    """
-    Read data from a file based on the specified data type.
-    :param input_file
-    :return:
-    """
-
-    if input_file.endswith(".jsonl"):
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = [json.loads(line) for line in f]
-    elif input_file.endswith(".json"):
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = json.load(f)
-    elif input_file.endswith(".txt"):
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = [line.strip() for line in f if line.strip()]
-            data = [{"content": line} for line in data]
-    else:
-        raise ValueError(f"Unsupported file format: {input_file}")
-
-    return data
webui/app.py CHANGED
@@ -116,35 +116,6 @@ def run_graphgen(params, progress=gr.Progress()):
         env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
     )
 
-    # Load input data
-    file = config["input_file"]
-    if isinstance(file, list):
-        file = file[0]
-
-    data = []
-
-    if file.endswith(".jsonl"):
-        config["input_data_type"] = "raw"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.loads(line) for line in f)
-    elif file.endswith(".json"):
-        config["input_data_type"] = "chunked"
-        with open(file, "r", encoding="utf-8") as f:
-            data.extend(json.load(f))
-    elif file.endswith(".txt"):
-        # After reading the file, convert it into raw-format data according to chunk_size
-        config["input_data_type"] = "raw"
-        content = ""
-        with open(file, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-            for line in lines:
-                content += line.strip() + " "
-        size = int(config.get("chunk_size", 512))
-        chunks = [content[i : i + size] for i in range(0, len(content), size)]
-        data.extend([{"content": chunk} for chunk in chunks])
-    else:
-        raise ValueError(f"Unsupported file type: {file}")
-
     # Initialize GraphGen
     graph_gen = init_graph_gen(config, env)
     graph_gen.clear()
@@ -436,19 +407,20 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
 upload_file = gr.File(
     label=_("Upload File"),
     file_count="single",
-    file_types=[".txt", ".json", ".jsonl"],
+    file_types=[".txt", ".json", ".jsonl", ".csv"],
     interactive=True,
 )
 examples_dir = os.path.join(root_dir, "webui", "examples")
 gr.Examples(
     examples=[
         [os.path.join(examples_dir, "txt_demo.txt")],
-        [os.path.join(examples_dir, "raw_demo.jsonl")],
-        [os.path.join(examples_dir, "chunked_demo.json")],
+        [os.path.join(examples_dir, "jsonl_demo.jsonl")],
+        [os.path.join(examples_dir, "json_demo.json")],
+        [os.path.join(examples_dir, "csv_demo.csv")],
     ],
     inputs=upload_file,
     label=_("Example Files"),
-    examples_per_page=3,
+    examples_per_page=4,
 )
 with gr.Column(scale=1):
     output = gr.File(
webui/count_tokens.py CHANGED
@@ -1,6 +1,7 @@
+import json
 import os
 import sys
-import json
+
 import pandas as pd
 
 # pylint: disable=wrong-import-position
@@ -8,24 +9,29 @@ root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(root_dir)
 from graphgen.models import Tokenizer
 
+
 def count_tokens(file, tokenizer_name, data_frame):
     if not file or not os.path.exists(file):
         return data_frame
 
     if file.endswith(".jsonl"):
-        with open(file, "r", encoding='utf-8') as f:
+        with open(file, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
     elif file.endswith(".json"):
-        with open(file, "r", encoding='utf-8') as f:
+        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            data = [item for sublist in data for item in sublist]
     elif file.endswith(".txt"):
-        with open(file, "r", encoding='utf-8') as f:
+        with open(file, "r", encoding="utf-8") as f:
            data = f.read()
-        chunks = [
-            data[i:i + 512] for i in range(0, len(data), 512)
-        ]
+        chunks = [data[i : i + 512] for i in range(0, len(data), 512)]
         data = [{"content": chunk} for chunk in chunks]
+    elif file.endswith(".csv"):
+        df = pd.read_csv(file)
+        if "content" in df.columns:
+            data = df["content"].tolist()
+        else:
+            data = df.iloc[:, 0].tolist()
     else:
         raise ValueError(f"Unsupported file type: {file}")
 
@@ -41,20 +47,13 @@ def count_tokens(file, tokenizer_name, data_frame):
             content = item
         token_count += len(tokenizer.encode_string(content))
 
-    _update_data = [[
-        str(token_count),
-        str(token_count * 50),
-        "N/A"
-    ]]
+    _update_data = [[str(token_count), str(token_count * 50), "N/A"]]
 
     try:
-        new_df = pd.DataFrame(
-            _update_data,
-            columns=data_frame.columns
-        )
+        new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
         data_frame = new_df
 
-    except Exception as e: # pylint: disable=broad-except
+    except Exception as e:  # pylint: disable=broad-except
        print("[ERROR] DataFrame操作异常:", str(e))
 
    return data_frame
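The new CSV branch in count_tokens prefers a "content" column and otherwise falls back to the first column. A stand-alone restatement of that rule for reference; the csv_texts helper below is illustrative only, not part of this commit.

import pandas as pd


def csv_texts(path: str) -> list:
    """Mirror of the CSV branch above: use 'content' if present, else the first column."""
    df = pd.read_csv(path)
    if "content" in df.columns:
        return df["content"].tolist()
    return df.iloc[:, 0].tolist()


print(csv_texts("webui/examples/csv_demo.csv")[0][:50])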
webui/examples/chunked_demo.json DELETED
@@ -1,14 +0,0 @@
-[
-  [
-    {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}
-  ],
-  [
-    {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}
-  ],
-  [
-    {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}
-  ],
-  [
-    {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."}
-  ]
-]
webui/examples/csv_demo.csv ADDED
@@ -0,0 +1,5 @@
+content
+"云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"
+"隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"
+"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."
+"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."
webui/examples/json_demo.json ADDED
@@ -0,0 +1,6 @@
+[
+  {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"},
+  {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"},
+  {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."},
+  {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."}
+]
webui/examples/{raw_demo.jsonl → jsonl_demo.jsonl} RENAMED
File without changes