supervised_finetuning_quiz

Runtime error

supervised_finetuning_quiz / data /supervised-finetuning.json

burtenshaw

update application with latest logic from parent

c0d911e 5 months ago

15.8 kB

	[
	{
	"question": "What is Supervised Fine-Tuning (SFT) in the context of LLMs?",
	"answer_a": "A technique to make models run faster",
	"answer_b": "A method to train models on specific tasks using labeled data",
	"answer_c": "A way to reduce model size",
	"answer_d": "A process to create new model architectures",
	"correct_answer": "B"
	},
	{
	"question": "What is LoRA (Low-Rank Adaptation)?",
	"answer_a": "A new type of transformer architecture",
	"answer_b": "A method to compress models after training",
	"answer_c": "An efficient fine-tuning technique that updates a small number of trainable parameters",
	"answer_d": "A data preprocessing technique",
	"correct_answer": "C"
	},
	{
	"question": "What is the main advantage of using LoRA for fine-tuning?",
	"answer_a": "It makes models more accurate than full fine-tuning",
	"answer_b": "It reduces memory requirements and training costs while maintaining performance",
	"answer_c": "It allows training without any original model weights",
	"answer_d": "It automatically improves model inference speed",
	"correct_answer": "B"
	},
	{
	"question": "In chat templates, what is the purpose of the 'system' message?",
	"answer_a": "To log system errors",
	"answer_b": "To define the behavior and role of the assistant",
	"answer_c": "To store user preferences",
	"answer_d": "To handle technical configurations",
	"correct_answer": "B"
	},
	{
	"question": "Which of these is a common format marker in chat templates?",
	"answer_a": "<\|im_start\|>",
	"answer_b": "{BEGIN}",
	"answer_c": "START_CHAT",
	"answer_d": "<<START>>",
	"correct_answer": "A"
	},
	{
	"question": "What is the primary purpose of SFT datasets?",
	"answer_a": "To create new model architectures",
	"answer_b": "To train models to follow specific instructions and generate desired outputs",
	"answer_c": "To test model performance only",
	"answer_d": "To compress model size",
	"correct_answer": "B"
	},
	{
	"question": "Which statement about LoRA is TRUE?",
	"answer_a": "It requires modifying the original model architecture",
	"answer_b": "It can only be used with small models",
	"answer_c": "It adds low-rank matrices to existing weights during training",
	"answer_d": "It permanently changes the base model weights",
	"correct_answer": "C"
	},
	{
	"question": "What is a key benefit of using standardized chat templates?",
	"answer_a": "They make models run faster",
	"answer_b": "They ensure consistent formatting across different model implementations",
	"answer_c": "They reduce model size",
	"answer_d": "They eliminate the need for tokenization",
	"correct_answer": "B"
	},
	{
	"question": "In the context of SFT, what is a 'prompt-completion' pair?",
	"answer_a": "Two separate models working together",
	"answer_b": "A training example consisting of an input and its desired output",
	"answer_c": "A type of model architecture",
	"answer_d": "A method to compress training data",
	"correct_answer": "B"
	},
	{
	"question": "Which of these is NOT a typical component of a chat template?",
	"answer_a": "System message",
	"answer_b": "User message",
	"answer_c": "Assistant message",
	"answer_d": "Database message",
	"correct_answer": "D"
	},
	{
	"question": "What is the purpose of the SFTTrainer in the TRL library?",
	"answer_a": "To train models from scratch",
	"answer_b": "To simplify the process of fine-tuning language models on instruction datasets",
	"answer_c": "To evaluate model performance",
	"answer_d": "To compress model weights",
	"correct_answer": "B"
	},
	{
	"question": "What is a key characteristic of LoRA's training approach?",
	"answer_a": "It trains all model parameters",
	"answer_b": "It only works with small models",
	"answer_c": "It freezes the original model weights and injects trainable rank decomposition matrices",
	"answer_d": "It requires multiple GPUs",
	"correct_answer": "C"
	},
	{
	"question": "Which parameter in LoRA determines the size of the rank decomposition matrices?",
	"answer_a": "lora_alpha",
	"answer_b": "r",
	"answer_c": "dropout",
	"answer_d": "bias",
	"correct_answer": "B"
	},
	{
	"question": "What is the role of 'target_modules' in LoRA configuration?",
	"answer_a": "To specify which layers to remove",
	"answer_b": "To define which layers will be adapted with LoRA",
	"answer_c": "To set the learning rate for each layer",
	"answer_d": "To determine model output",
	"correct_answer": "B"
	},
	{
	"question": "What is the purpose of chat template's 'add_generation_prompt' parameter?",
	"answer_a": "To end the conversation",
	"answer_b": "To add a prompt for the model to continue generating",
	"answer_c": "To change the system message",
	"answer_d": "To modify user input",
	"correct_answer": "B"
	},
	{
	"question": "In SFT training, what is gradient checkpointing used for?",
	"answer_a": "To save training progress",
	"answer_b": "To reduce memory usage during training",
	"answer_c": "To increase model accuracy",
	"answer_d": "To speed up training",
	"correct_answer": "B"
	},
	{
	"question": "What is the purpose of the 'lora_alpha' parameter in LoRA?",
	"answer_a": "To set the learning rate",
	"answer_b": "To scale the LoRA weights during inference",
	"answer_c": "To determine batch size",
	"answer_d": "To control model size",
	"correct_answer": "B"
	},
	{
	"question": "Which of these is a benefit of using the SFTTrainer?",
	"answer_a": "It automatically handles padding and truncation of inputs",
	"answer_b": "It creates new model architectures",
	"answer_c": "It performs unsupervised learning",
	"answer_d": "It generates training data",
	"correct_answer": "A"
	},
	{
	"question": "What is the purpose of 'formatting_func' in SFTTrainer?",
	"answer_a": "To format the output text",
	"answer_b": "To preprocess and structure the training data",
	"answer_c": "To modify model architecture",
	"answer_d": "To handle error messages",
	"correct_answer": "B"
	},
	{
	"question": "Which of these is TRUE about LoRA training?",
	"answer_a": "It requires more memory than full fine-tuning",
	"answer_b": "It can only be used with specific model architectures",
	"answer_c": "It allows efficient adaptation while keeping original weights frozen",
	"answer_d": "It always produces better results than full fine-tuning",
	"correct_answer": "C"
	},
	{
	"question": "What is the purpose of 'max_seq_length' in SFTTrainer?",
	"answer_a": "To limit the model's vocabulary size",
	"answer_b": "To set the maximum length of input sequences",
	"answer_c": "To determine the batch size",
	"answer_d": "To control the learning rate",
	"correct_answer": "B"
	},
	{
	"question": "In chat templates, what is the purpose of conversation history?",
	"answer_a": "To store user preferences",
	"answer_b": "To maintain context across multiple turns of dialogue",
	"answer_c": "To track error messages",
	"answer_d": "To count tokens",
	"correct_answer": "B"
	},
	{
	"question": "What is a key advantage of using BitsAndBytes for SFT?",
	"answer_a": "It makes training faster",
	"answer_b": "It reduces memory usage through quantization",
	"answer_c": "It improves model accuracy",
	"answer_d": "It simplifies the code",
	"correct_answer": "B"
	},
	{
	"question": "Which of these is NOT a typical parameter in LoRA configuration?",
	"answer_a": "r",
	"answer_b": "lora_alpha",
	"answer_c": "model_size",
	"answer_d": "target_modules",
	"correct_answer": "C"
	},
	{
	"question": "What is the purpose of 'warmup_ratio' in training arguments?",
	"answer_a": "To set the final learning rate",
	"answer_b": "To determine the portion of training used for learning rate warmup",
	"answer_c": "To control model temperature",
	"answer_d": "To set the batch size",
	"correct_answer": "B"
	},
	{
	"question": "Which statement about SFT datasets is TRUE?",
	"answer_a": "They must always be in JSON format",
	"answer_b": "They typically contain input-output pairs for training",
	"answer_c": "They can only contain single-turn conversations",
	"answer_d": "They must include system prompts",
	"correct_answer": "B"
	},
	{
	"question": "What is the role of 'gradient_accumulation_steps' in training?",
	"answer_a": "To speed up training",
	"answer_b": "To simulate larger batch sizes with limited memory",
	"answer_c": "To reduce model size",
	"answer_d": "To improve accuracy",
	"correct_answer": "B"
	},
	{
	"question": "Which of these is a common use case for LoRA?",
	"answer_a": "Creating new model architectures",
	"answer_b": "Adapting large models to specific tasks efficiently",
	"answer_c": "Reducing model inference time",
	"answer_d": "Generating training data",
	"correct_answer": "B"
	},
	{
	"question": "What is the purpose of 'save_total_limit' in training arguments?",
	"answer_a": "To limit the model's vocabulary",
	"answer_b": "To control how many checkpoints are saved during training",
	"answer_c": "To set the maximum sequence length",
	"answer_d": "To limit training time",
	"correct_answer": "B"
	},
	{
	"question": "Which optimization technique is commonly used with LoRA?",
	"answer_a": "SGD",
	"answer_b": "AdamW",
	"answer_c": "RMSprop",
	"answer_d": "Momentum",
	"correct_answer": "B"
	},
	{
	"question": "What is the most significant difference between full fine-tuning and LoRA?",
	"answer_a": "LoRA updates a subset of model weights while full fine-tuning updates all weights",
	"answer_b": "LoRA adds new parameters while keeping original weights frozen",
	"answer_c": "LoRA modifies attention layers while full fine-tuning modifies feed-forward layers",
	"answer_d": "LoRA trains faster but requires more memory than full fine-tuning",
	"correct_answer": "B"
	},
	{
	"question": "When implementing chat templates, which approach is most likely to maintain model performance?",
	"answer_a": "Using the exact template format from the model's training data",
	"answer_b": "Using a simplified template with just role and content",
	"answer_c": "Using a standardized template across all models",
	"answer_d": "Using a template with additional control tokens",
	"correct_answer": "A"
	},
	{
	"question": "What is the key technical innovation of LoRA's rank decomposition approach?",
	"answer_a": "It reduces model parameters through matrix factorization",
	"answer_b": "It decomposes weight updates into low-rank matrices while preserving model capacity",
	"answer_c": "It compresses the model weights using SVD decomposition",
	"answer_d": "It optimizes attention mechanisms through rank reduction",
	"correct_answer": "B"
	},
	{
	"question": "How does the 'r' parameter in LoRA affect the training process?",
	"answer_a": "Higher r increases model capacity but requires more memory",
	"answer_b": "Lower r reduces training time but may impact performance",
	"answer_c": "Higher r improves convergence but increases computation",
	"answer_d": "Lower r decreases memory usage but may limit expressiveness",
	"correct_answer": "D"
	},
	{
	"question": "What is the primary consideration when choosing target_modules for LoRA?",
	"answer_a": "Selecting layers that most influence task-specific behavior",
	"answer_b": "Targeting modules with the most parameters",
	"answer_c": "Choosing layers closest to the model output",
	"answer_d": "Selecting modules with the least impact on inference speed",
	"correct_answer": "A"
	},
	{
	"question": "How does gradient checkpointing affect the training process in SFT?",
	"answer_a": "Trades computation time for reduced memory usage",
	"answer_b": "Reduces memory by storing fewer activation gradients",
	"answer_c": "Improves training stability through gradient accumulation",
	"answer_d": "Optimizes memory by recomputing forward passes",
	"correct_answer": "A"
	},
	{
	"question": "What role does lora_alpha play in the training dynamics?",
	"answer_a": "Controls the learning rate scaling of LoRA updates",
	"answer_b": "Scales the contribution of LoRA weights during inference",
	"answer_c": "Determines the initialization range of LoRA matrices",
	"answer_d": "Adjusts the gradient flow through LoRA layers",
	"correct_answer": "B"
	},
	{
	"question": "Which aspect of SFT datasets most influences training effectiveness?",
	"answer_a": "The diversity of instruction-output pairs",
	"answer_b": "The total number of training examples",
	"answer_c": "The complexity of individual instructions",
	"answer_d": "The length of output sequences",
	"correct_answer": "A"
	},
	{
	"question": "How does warmup_ratio impact the training dynamics?",
	"answer_a": "Prevents early overfitting by gradually increasing learning rate",
	"answer_b": "Stabilizes initial training by ramping up learning rate",
	"answer_c": "Reduces gradient variance in early training steps",
	"answer_d": "Improves model convergence through learning rate scheduling",
	"correct_answer": "B"
	},
	{
	"question": "What is the primary challenge addressed by gradient_accumulation_steps?",
	"answer_a": "Memory constraints limiting batch size",
	"answer_b": "Training instability with large learning rates",
	"answer_c": "Slow convergence with small batches",
	"answer_d": "Gradient vanishing in deep networks",
	"correct_answer": "A"
	},
	{
	"question": "How does BitsAndBytes quantization affect SFT training?",
	"answer_a": "Reduces precision while maintaining training stability",
	"answer_b": "Compresses weights with minimal performance impact",
	"answer_c": "Optimizes memory usage through dynamic quantization",
	"answer_d": "Balances precision and memory requirements",
	"correct_answer": "D"
	},
	{
	"question": "What distinguishes an effective chat template implementation?",
	"answer_a": "Minimal special token usage with clear role separation",
	"answer_b": "Consistent formatting with explicit turn boundaries",
	"answer_c": "Efficient token usage while maintaining context",
	"answer_d": "Flexible role definition with standardized markers",
	"correct_answer": "C"
	}
	]