Spaces:

Claimant
/

POMS-QA-GraphRAG

Sleeping

App Files Files Community

POMS-QA-GraphRAG / settings.yaml

npc0

Update settings.yaml

422fa56 verified 8 months ago

raw

history blame

3.73 kB

	encoding_model: cl100k_base
	skip_workflows: []
	llm:
	api_key: "3bf18984-b4df-49ba-a30b-6cbae3964b08"
	type: openai_chat
	model_supports_json: true
	model: claude-3-5-sonnet-20240620
	api_base: http://localhost:8000/v1
	# max_tokens: 10000 # Adjusted based on Claude 3 Haiku's typical context window
	request_timeout: 30
	tokens_per_minute: 100000
	requests_per_minute: 1000
	max_retry_wait: 5
	temperature: 0.1

	embeddings:
	async_mode: threaded
	llm:
	api_key: "EMBEDDING_API_KEY"
	type: openai_embedding
	model: mixedbread-ai/mxbai-embed-large-v1
	api_base: http://localhost:7997

	chunks:
	size: 1200
	overlap: 100
	group_by_columns: [id] # by default, we don't allow chunks to cross documents

	input:
	type: file # or blob
	file_type: text # or csv
	base_dir: "input"
	file_encoding: utf-8
	file_pattern: ".*\\.txt$"

	cache:
	type: file # or blob
	base_dir: "cache"
	# connection_string: <azure_blob_storage_connection_string>
	# container_name: <azure_blob_storage_container_name>

	storage:
	type: file # or blob
	base_dir: "output/${timestamp}/artifacts"
	# connection_string: <azure_blob_storage_connection_string>
	# container_name: <azure_blob_storage_container_name>

	reporting:
	type: file # or console, blob
	base_dir: "output/${timestamp}/reports"
	# connection_string: <azure_blob_storage_connection_string>
	# container_name: <azure_blob_storage_container_name>

	entity_extraction:
	## llm: override the global llm settings for this task
	## parallelization: override the global parallelization settings for this task
	## async_mode: override the global async_mode settings for this task
	prompt: "prompts/entity_extraction.txt"
	entity_types: [organization,person,geo,event]
	max_gleanings: 1

	summarize_descriptions:
	## llm: override the global llm settings for this task
	## parallelization: override the global parallelization settings for this task
	## async_mode: override the global async_mode settings for this task
	prompt: "prompts/summarize_descriptions.txt"
	max_length: 500

	claim_extraction:
	## llm: override the global llm settings for this task
	## parallelization: override the global parallelization settings for this task
	## async_mode: override the global async_mode settings for this task
	# enabled: true
	prompt: "prompts/claim_extraction.txt"
	description: "Any claims or facts that could be relevant to information discovery."
	max_gleanings: 1

	community_reports:
	## llm: override the global llm settings for this task
	## parallelization: override the global parallelization settings for this task
	## async_mode: override the global async_mode settings for this task
	prompt: "prompts/community_report.txt"
	max_length: 2000
	max_input_length: 8000

	cluster_graph:
	max_cluster_size: 10

	embed_graph:
	enabled: false # if true, will generate node2vec embeddings for nodes
	# num_walks: 10
	# walk_length: 40
	# window_size: 2
	# iterations: 3
	# random_seed: 597832

	umap:
	enabled: false # if true, will generate UMAP embeddings for nodes

	snapshots:
	graphml: false
	raw_entities: false
	top_level_nodes: false

	local_search:
	# text_unit_prop: 0.5
	# community_prop: 0.1
	# conversation_history_max_turns: 5
	# top_k_mapped_entities: 10
	# top_k_relationships: 10
	# llm_temperature: 0 # temperature for sampling
	# llm_top_p: 1 # top-p sampling
	# llm_n: 1 # Number of completions to generate
	# max_tokens: 12000

	global_search:
	# llm_temperature: 0 # temperature for sampling
	# llm_top_p: 1 # top-p sampling
	# llm_n: 1 # Number of completions to generate
	# max_tokens: 12000
	# data_max_tokens: 12000
	# map_max_tokens: 1000
	# reduce_max_tokens: 2000
	# concurrency: 32