# =============================================================================
# REQUIRED CONFIGURATION
# =============================================================================
# Hugging Face token with read/write permissions for repositories and the Inference API.
# Get one from: https://huggingface.co/settings/tokens
HF_TOKEN=hf_...
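# To confirm the token is valid (a quick check, assuming the huggingface_hub
# CLI is installed):
#   huggingface-cli whoami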
# -----------------------------------------------------------------------------
# GENERATION SETTINGS
# -----------------------------------------------------------------------------
MAX_NUM_TOKENS=2048
MAX_NUM_ROWS=1000
DEFAULT_BATCH_SIZE=5
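# Example: with MAX_NUM_ROWS=1000 and DEFAULT_BATCH_SIZE=5, a full run amounts
# to 1000 / 5 = 200 generation batches (assuming rows are produced
# DEFAULT_BATCH_SIZE at a time).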
# Required for chat data generation with Llama or Qwen models.
# Options: "llama3", "qwen2", or a custom template string.
# MAGPIE_PRE_QUERY_TEMPLATE=qwen2
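# For reference, the named templates correspond to the models' chat prefixes
# (values as used by distilabel; treat these as an assumption, not a spec):
#   llama3 -> "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
#   qwen2  -> "<|im_start|>user\n"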
# =============================================================================
# MODEL & SERVICES CONFIGURATION
# =============================================================================
# -----------------------------------------------------------------------------
# A. STANDALONE SETUP (No additional installation required)
# -----------------------------------------------------------------------------
# 1. HUGGING FACE SERVERLESS (Recommended default)
# Requires only HF_TOKEN.
# MODEL=meta-llama/Llama-3.1-8B-Instruct
# MODEL=Qwen/Qwen2.5-1.5B-Instruct
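# A minimal connectivity check against the serverless Inference API (a sketch;
# the exact payload depends on the task):
#   curl https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct \
#     -H "Authorization: Bearer $HF_TOKEN" \
#     -d '{"inputs": "Hello"}'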
# 2. ARGILLA ON HUGGING FACE SPACES (Recommended for data annotation)
# ARGILLA_API_URL=https://daqc-my-argilla.hf.space/
# ARGILLA_API_KEY=
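# The API key for a Space-hosted Argilla instance is shown in the Argilla UI
# after logging in, under "My Settings" (an assumption; the exact menu name may
# differ between Argilla versions).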
# 3. OPENAI API
# Requires: an OpenAI API key
# OPENAI_BASE_URL=https://api.openai.com/v1/
# MODEL=gpt-4
# API_KEY=
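# A minimal connectivity check (a sketch using the standard chat completions
# endpoint; note OPENAI_BASE_URL already ends with /v1/):
#   curl ${OPENAI_BASE_URL}chat/completions \
#     -H "Authorization: Bearer $API_KEY" -H "Content-Type: application/json" \
#     -d '{"model": "gpt-4", "messages": [{"role": "user", "content": "Hello"}]}'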
# -----------------------------------------------------------------------------
# B. LOCAL SETUP (Requires local installation)
# -----------------------------------------------------------------------------
# 1. LOCAL OLLAMA
# Requires: Ollama installed (https://ollama.ai)
# OLLAMA_BASE_URL=http://127.0.0.1:11434/
# MODEL=qwen2.5:32b-instruct-q5_K_S
# TOKENIZER_ID=Qwen/Qwen2.5-32B-Instruct
# MODEL=deepseek-r1:1.5b
# TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
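# To make a model available locally and verify the server responds (a sketch):
#   ollama pull qwen2.5:32b-instruct-q5_K_S
#   curl http://127.0.0.1:11434/api/tags   # lists locally available models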
# 2. LOCAL VLLM
# Requires: vLLM installed (https://github.com/vllm-project/vllm)
# VLLM_BASE_URL=http://127.0.0.1:8000/
# MODEL=Qwen/Qwen2.5-1.5B-Instruct
# TOKENIZER_ID=Qwen/Qwen2.5-1.5B-Instruct
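# One way to launch an OpenAI-compatible vLLM server for this model (a sketch;
# flags vary by vLLM version):
#   vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000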
# 3. LOCAL TGI/ENDPOINTS
# Requires: Text Generation Inference installed (https://github.com/huggingface/text-generation-inference)
# HUGGINGFACE_BASE_URL=http://127.0.0.1:3000/
# TOKENIZER_ID=meta-llama/Llama-3.1-8B-Instruct
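# One way to run TGI locally via Docker (a sketch; adjust GPU flags and volumes
# to your machine -- the container serves on port 80 by default):
#   docker run --gpus all -p 3000:80 -v $PWD/tgi-data:/data \
#     ghcr.io/huggingface/text-generation-inference:latest \
#     --model-id meta-llama/Llama-3.1-8B-Instruct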
# -----------------------------------------------------------------------------
# C. DOCKER SETUP (Ready to use with docker-compose; recommended for a full setup)
# -----------------------------------------------------------------------------
# 1. DOCKER OLLAMA
OLLAMA_BASE_URL=http://ollama:11434
# Options for OLLAMA_HARDWARE: latest (for CPU/NVIDIA), rocm (for AMD)
OLLAMA_HARDWARE=latest
# DEEPSEEK R1
# MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
# TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
# MAGPIE_PRE_QUERY_TEMPLATE="<|begin▁of▁sentence|>User: "  # custom template for this model
# LLAMA 3.2
MODEL=llama3.2:1b  # model for instruction generation
TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct  # tokenizer for instruction generation
MAGPIE_PRE_QUERY_TEMPLATE=llama3  # Magpie template required for instruction generation
# 2. DOCKER ARGILLA (persistent data)
ARGILLA_API_URL=http://argilla:6900
ARGILLA_USERNAME=admin
ARGILLA_PASSWORD=admin1234
ARGILLA_API_KEY=admin.1234
ARGILLA_REINDEX_DATASET=1
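# Once the containers are up, the Argilla UI should be reachable from the host
# (assuming the compose file maps Argilla's default port 6900):
#   http://localhost:6900   # log in with ARGILLA_USERNAME / ARGILLA_PASSWORD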
# Usage:
#   docker-compose --profile with-ollama --profile with-argilla build
#   docker-compose --profile with-ollama up -d              # in a new terminal
#   docker-compose exec ollama ollama run llama3.2:1b       # pull and start the model
#   docker-compose --profile with-ollama --profile with-argilla up -d
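# To stop and remove the containers (named volumes, and thus Argilla data, are
# kept unless you add -v):
#   docker-compose --profile with-ollama --profile with-argilla down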