npc0 committed on
Commit
422fa56
·
verified ·
1 Parent(s): 8bb0452

Update settings.yaml

Browse files
Files changed (1) hide show
  1. settings.yaml +110 -5
settings.yaml CHANGED
@@ -1,9 +1,11 @@
 
 
1
  llm:
2
  api_key: "3bf18984-b4df-49ba-a30b-6cbae3964b08"
3
  type: openai_chat
4
  model_supports_json: true
5
- model: "claude-3-5-sonnet-20240620"
6
- api_base: "http://localhost:8000/v1"
7
  # max_tokens: 10000 # Adjusted based on Claude 3 Haiku's typical context window
8
  request_timeout: 30
9
  tokens_per_minute: 100000
@@ -11,9 +13,112 @@ llm:
11
  max_retry_wait: 5
12
  temperature: 0.1
13
 
14
- embeddings:
15
  async_mode: threaded
16
  llm:
 
17
  type: openai_embedding
18
- model: "mixedbread-ai/mxbai-embed-large-v1"
19
- api_base: "http://localhost:7997"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ encoding_model: cl100k_base
2
+ skip_workflows: []
3
  llm:
4
  api_key: "3bf18984-b4df-49ba-a30b-6cbae3964b08"
5
  type: openai_chat
6
  model_supports_json: true
7
+ model: claude-3-5-sonnet-20240620
8
+ api_base: http://localhost:8000/v1
9
  # max_tokens: 10000 # Adjusted based on Claude 3 Haiku's typical context window
10
  request_timeout: 30
11
  tokens_per_minute: 100000
 
13
  max_retry_wait: 5
14
  temperature: 0.1
15
 
16
+ embeddings:
17
  async_mode: threaded
18
  llm:
19
+ api_key: "EMBEDDING_API_KEY"
20
  type: openai_embedding
21
+ model: mixedbread-ai/mxbai-embed-large-v1
22
+ api_base: http://localhost:7997
23
+
24
+ chunks:
25
+ size: 1200
26
+ overlap: 100
27
+ group_by_columns: [id] # by default, we don't allow chunks to cross documents
28
+
29
+ input:
30
+ type: file # or blob
31
+ file_type: text # or csv
32
+ base_dir: "input"
33
+ file_encoding: utf-8
34
+ file_pattern: ".*\\.txt$"
35
+
36
+ cache:
37
+ type: file # or blob
38
+ base_dir: "cache"
39
+ # connection_string: <azure_blob_storage_connection_string>
40
+ # container_name: <azure_blob_storage_container_name>
41
+
42
+ storage:
43
+ type: file # or blob
44
+ base_dir: "output/${timestamp}/artifacts"
45
+ # connection_string: <azure_blob_storage_connection_string>
46
+ # container_name: <azure_blob_storage_container_name>
47
+
48
+ reporting:
49
+ type: file # or console, blob
50
+ base_dir: "output/${timestamp}/reports"
51
+ # connection_string: <azure_blob_storage_connection_string>
52
+ # container_name: <azure_blob_storage_container_name>
53
+
54
+ entity_extraction:
55
+ ## llm: override the global llm settings for this task
56
+ ## parallelization: override the global parallelization settings for this task
57
+ ## async_mode: override the global async_mode settings for this task
58
+ prompt: "prompts/entity_extraction.txt"
59
+ entity_types: [organization,person,geo,event]
60
+ max_gleanings: 1
61
+
62
+ summarize_descriptions:
63
+ ## llm: override the global llm settings for this task
64
+ ## parallelization: override the global parallelization settings for this task
65
+ ## async_mode: override the global async_mode settings for this task
66
+ prompt: "prompts/summarize_descriptions.txt"
67
+ max_length: 500
68
+
69
+ claim_extraction:
70
+ ## llm: override the global llm settings for this task
71
+ ## parallelization: override the global parallelization settings for this task
72
+ ## async_mode: override the global async_mode settings for this task
73
+ # enabled: true
74
+ prompt: "prompts/claim_extraction.txt"
75
+ description: "Any claims or facts that could be relevant to information discovery."
76
+ max_gleanings: 1
77
+
78
+ community_reports:
79
+ ## llm: override the global llm settings for this task
80
+ ## parallelization: override the global parallelization settings for this task
81
+ ## async_mode: override the global async_mode settings for this task
82
+ prompt: "prompts/community_report.txt"
83
+ max_length: 2000
84
+ max_input_length: 8000
85
+
86
+ cluster_graph:
87
+ max_cluster_size: 10
88
+
89
+ embed_graph:
90
+ enabled: false # if true, will generate node2vec embeddings for nodes
91
+ # num_walks: 10
92
+ # walk_length: 40
93
+ # window_size: 2
94
+ # iterations: 3
95
+ # random_seed: 597832
96
+
97
+ umap:
98
+ enabled: false # if true, will generate UMAP embeddings for nodes
99
+
100
+ snapshots:
101
+ graphml: false
102
+ raw_entities: false
103
+ top_level_nodes: false
104
+
105
+ local_search:
106
+ # text_unit_prop: 0.5
107
+ # community_prop: 0.1
108
+ # conversation_history_max_turns: 5
109
+ # top_k_mapped_entities: 10
110
+ # top_k_relationships: 10
111
+ # llm_temperature: 0 # temperature for sampling
112
+ # llm_top_p: 1 # top-p sampling
113
+ # llm_n: 1 # Number of completions to generate
114
+ # max_tokens: 12000
115
+
116
+ global_search:
117
+ # llm_temperature: 0 # temperature for sampling
118
+ # llm_top_p: 1 # top-p sampling
119
+ # llm_n: 1 # Number of completions to generate
120
+ # max_tokens: 12000
121
+ # data_max_tokens: 12000
122
+ # map_max_tokens: 1000
123
+ # reduce_max_tokens: 2000
124
+ # concurrency: 32