supervised_finetuning_quiz / data /supervised-finetuning.json
burtenshaw
update application with latest logic from parent
c0d911e
[
{
"question": "What is Supervised Fine-Tuning (SFT) in the context of LLMs?",
"answer_a": "A technique to make models run faster",
"answer_b": "A method to train models on specific tasks using labeled data",
"answer_c": "A way to reduce model size",
"answer_d": "A process to create new model architectures",
"correct_answer": "B"
},
{
"question": "What is LoRA (Low-Rank Adaptation)?",
"answer_a": "A new type of transformer architecture",
"answer_b": "A method to compress models after training",
"answer_c": "An efficient fine-tuning technique that updates a small number of trainable parameters",
"answer_d": "A data preprocessing technique",
"correct_answer": "C"
},
{
"question": "What is the main advantage of using LoRA for fine-tuning?",
"answer_a": "It makes models more accurate than full fine-tuning",
"answer_b": "It reduces memory requirements and training costs while maintaining performance",
"answer_c": "It allows training without any original model weights",
"answer_d": "It automatically improves model inference speed",
"correct_answer": "B"
},
{
"question": "In chat templates, what is the purpose of the 'system' message?",
"answer_a": "To log system errors",
"answer_b": "To define the behavior and role of the assistant",
"answer_c": "To store user preferences",
"answer_d": "To handle technical configurations",
"correct_answer": "B"
},
{
"question": "Which of these is a common format marker in chat templates?",
"answer_a": "<|im_start|>",
"answer_b": "{BEGIN}",
"answer_c": "START_CHAT",
"answer_d": "<<START>>",
"correct_answer": "A"
},
{
"question": "What is the primary purpose of SFT datasets?",
"answer_a": "To create new model architectures",
"answer_b": "To train models to follow specific instructions and generate desired outputs",
"answer_c": "To test model performance only",
"answer_d": "To compress model size",
"correct_answer": "B"
},
{
"question": "Which statement about LoRA is TRUE?",
"answer_a": "It requires modifying the original model architecture",
"answer_b": "It can only be used with small models",
"answer_c": "It adds low-rank matrices to existing weights during training",
"answer_d": "It permanently changes the base model weights",
"correct_answer": "C"
},
{
"question": "What is a key benefit of using standardized chat templates?",
"answer_a": "They make models run faster",
"answer_b": "They ensure consistent formatting across different model implementations",
"answer_c": "They reduce model size",
"answer_d": "They eliminate the need for tokenization",
"correct_answer": "B"
},
{
"question": "In the context of SFT, what is a 'prompt-completion' pair?",
"answer_a": "Two separate models working together",
"answer_b": "A training example consisting of an input and its desired output",
"answer_c": "A type of model architecture",
"answer_d": "A method to compress training data",
"correct_answer": "B"
},
{
"question": "Which of these is NOT a typical component of a chat template?",
"answer_a": "System message",
"answer_b": "User message",
"answer_c": "Assistant message",
"answer_d": "Database message",
"correct_answer": "D"
},
{
"question": "What is the purpose of the SFTTrainer in the TRL library?",
"answer_a": "To train models from scratch",
"answer_b": "To simplify the process of fine-tuning language models on instruction datasets",
"answer_c": "To evaluate model performance",
"answer_d": "To compress model weights",
"correct_answer": "B"
},
{
"question": "What is a key characteristic of LoRA's training approach?",
"answer_a": "It trains all model parameters",
"answer_b": "It only works with small models",
"answer_c": "It freezes the original model weights and injects trainable rank decomposition matrices",
"answer_d": "It requires multiple GPUs",
"correct_answer": "C"
},
{
"question": "Which parameter in LoRA determines the size of the rank decomposition matrices?",
"answer_a": "lora_alpha",
"answer_b": "r",
"answer_c": "dropout",
"answer_d": "bias",
"correct_answer": "B"
},
{
"question": "What is the role of 'target_modules' in LoRA configuration?",
"answer_a": "To specify which layers to remove",
"answer_b": "To define which layers will be adapted with LoRA",
"answer_c": "To set the learning rate for each layer",
"answer_d": "To determine model output",
"correct_answer": "B"
},
{
"question": "What is the purpose of the chat template's 'add_generation_prompt' parameter?",
"answer_a": "To end the conversation",
"answer_b": "To append the tokens that mark the start of an assistant response so the model generates a reply",
"answer_c": "To change the system message",
"answer_d": "To modify user input",
"correct_answer": "B"
},
{
"question": "In SFT training, what is gradient checkpointing used for?",
"answer_a": "To save training progress",
"answer_b": "To reduce memory usage during training",
"answer_c": "To increase model accuracy",
"answer_d": "To speed up training",
"correct_answer": "B"
},
{
"question": "What is the purpose of the 'lora_alpha' parameter in LoRA?",
"answer_a": "To set the learning rate",
"answer_b": "To scale the contribution of the LoRA update added to the frozen weights",
"answer_c": "To determine batch size",
"answer_d": "To control model size",
"correct_answer": "B"
},
{
"question": "Which of these is a benefit of using the SFTTrainer?",
"answer_a": "It automatically handles padding and truncation of inputs",
"answer_b": "It creates new model architectures",
"answer_c": "It performs unsupervised learning",
"answer_d": "It generates training data",
"correct_answer": "A"
},
{
"question": "What is the purpose of 'formatting_func' in SFTTrainer?",
"answer_a": "To format the output text",
"answer_b": "To preprocess and structure the training data",
"answer_c": "To modify model architecture",
"answer_d": "To handle error messages",
"correct_answer": "B"
},
{
"question": "Which of these is TRUE about LoRA training?",
"answer_a": "It requires more memory than full fine-tuning",
"answer_b": "It can only be used with specific model architectures",
"answer_c": "It allows efficient adaptation while keeping original weights frozen",
"answer_d": "It always produces better results than full fine-tuning",
"correct_answer": "C"
},
{
"question": "What is the purpose of 'max_seq_length' in SFTTrainer?",
"answer_a": "To limit the model's vocabulary size",
"answer_b": "To set the maximum length of input sequences",
"answer_c": "To determine the batch size",
"answer_d": "To control the learning rate",
"correct_answer": "B"
},
{
"question": "In chat templates, what is the purpose of conversation history?",
"answer_a": "To store user preferences",
"answer_b": "To maintain context across multiple turns of dialogue",
"answer_c": "To track error messages",
"answer_d": "To count tokens",
"correct_answer": "B"
},
{
"question": "What is a key advantage of using BitsAndBytes for SFT?",
"answer_a": "It makes training faster",
"answer_b": "It reduces memory usage through quantization",
"answer_c": "It improves model accuracy",
"answer_d": "It simplifies the code",
"correct_answer": "B"
},
{
"question": "Which of these is NOT a typical parameter in LoRA configuration?",
"answer_a": "r",
"answer_b": "lora_alpha",
"answer_c": "model_size",
"answer_d": "target_modules",
"correct_answer": "C"
},
{
"question": "What is the purpose of 'warmup_ratio' in training arguments?",
"answer_a": "To set the final learning rate",
"answer_b": "To determine the portion of training used for learning rate warmup",
"answer_c": "To control model temperature",
"answer_d": "To set the batch size",
"correct_answer": "B"
},
{
"question": "Which statement about SFT datasets is TRUE?",
"answer_a": "They must always be in JSON format",
"answer_b": "They typically contain input-output pairs for training",
"answer_c": "They can only contain single-turn conversations",
"answer_d": "They must include system prompts",
"correct_answer": "B"
},
{
"question": "What is the role of 'gradient_accumulation_steps' in training?",
"answer_a": "To speed up training",
"answer_b": "To simulate larger batch sizes with limited memory",
"answer_c": "To reduce model size",
"answer_d": "To improve accuracy",
"correct_answer": "B"
},
{
"question": "Which of these is a common use case for LoRA?",
"answer_a": "Creating new model architectures",
"answer_b": "Adapting large models to specific tasks efficiently",
"answer_c": "Reducing model inference time",
"answer_d": "Generating training data",
"correct_answer": "B"
},
{
"question": "What is the purpose of 'save_total_limit' in training arguments?",
"answer_a": "To limit the model's vocabulary",
"answer_b": "To control how many checkpoints are saved during training",
"answer_c": "To set the maximum sequence length",
"answer_d": "To limit training time",
"correct_answer": "B"
},
{
"question": "Which optimization technique is commonly used with LoRA?",
"answer_a": "SGD",
"answer_b": "AdamW",
"answer_c": "RMSprop",
"answer_d": "Momentum",
"correct_answer": "B"
},
{
"question": "What is the most significant difference between full fine-tuning and LoRA?",
"answer_a": "LoRA updates a subset of model weights while full fine-tuning updates all weights",
"answer_b": "LoRA adds new parameters while keeping original weights frozen",
"answer_c": "LoRA modifies attention layers while full fine-tuning modifies feed-forward layers",
"answer_d": "LoRA trains faster but requires more memory than full fine-tuning",
"correct_answer": "B"
},
{
"question": "When implementing chat templates, which approach is most likely to maintain model performance?",
"answer_a": "Using the exact template format from the model's training data",
"answer_b": "Using a simplified template with just role and content",
"answer_c": "Using a standardized template across all models",
"answer_d": "Using a template with additional control tokens",
"correct_answer": "A"
},
{
"question": "What is the key technical innovation of LoRA's rank decomposition approach?",
"answer_a": "It reduces model parameters through matrix factorization",
"answer_b": "It decomposes weight updates into low-rank matrices while preserving model capacity",
"answer_c": "It compresses the model weights using SVD decomposition",
"answer_d": "It optimizes attention mechanisms through rank reduction",
"correct_answer": "B"
},
{
"question": "How does the 'r' parameter in LoRA affect the training process?",
"answer_a": "Higher r increases model capacity but requires more memory",
"answer_b": "Lower r reduces training time but may impact performance",
"answer_c": "Higher r improves convergence but increases computation",
"answer_d": "Lower r decreases memory usage but may limit expressiveness",
"correct_answer": "D"
},
{
"question": "What is the primary consideration when choosing target_modules for LoRA?",
"answer_a": "Selecting layers that most influence task-specific behavior",
"answer_b": "Targeting modules with the most parameters",
"answer_c": "Choosing layers closest to the model output",
"answer_d": "Selecting modules with the least impact on inference speed",
"correct_answer": "A"
},
{
"question": "How does gradient checkpointing affect the training process in SFT?",
"answer_a": "Trades computation time for reduced memory usage",
"answer_b": "Reduces memory by storing fewer activation gradients",
"answer_c": "Improves training stability through gradient accumulation",
"answer_d": "Optimizes memory by recomputing forward passes",
"correct_answer": "A"
},
{
"question": "What role does lora_alpha play in the training dynamics?",
"answer_a": "Controls the learning rate scaling of LoRA updates",
"answer_b": "Scales the contribution of the LoRA update relative to the frozen base weights",
"answer_c": "Determines the initialization range of LoRA matrices",
"answer_d": "Adjusts the gradient flow through LoRA layers",
"correct_answer": "B"
},
{
"question": "Which aspect of SFT datasets most influences training effectiveness?",
"answer_a": "The diversity of instruction-output pairs",
"answer_b": "The total number of training examples",
"answer_c": "The complexity of individual instructions",
"answer_d": "The length of output sequences",
"correct_answer": "A"
},
{
"question": "How does warmup_ratio impact the training dynamics?",
"answer_a": "Prevents early overfitting by gradually increasing learning rate",
"answer_b": "Stabilizes initial training by ramping up learning rate",
"answer_c": "Reduces gradient variance in early training steps",
"answer_d": "Improves model convergence through learning rate scheduling",
"correct_answer": "B"
},
{
"question": "What is the primary challenge addressed by gradient_accumulation_steps?",
"answer_a": "Memory constraints limiting batch size",
"answer_b": "Training instability with large learning rates",
"answer_c": "Slow convergence with small batches",
"answer_d": "Gradient vanishing in deep networks",
"correct_answer": "A"
},
{
"question": "How does BitsAndBytes quantization affect SFT training?",
"answer_a": "Reduces precision while maintaining training stability",
"answer_b": "Compresses weights with minimal performance impact",
"answer_c": "Optimizes memory usage through dynamic quantization",
"answer_d": "Balances precision and memory requirements",
"correct_answer": "D"
},
{
"question": "What distinguishes an effective chat template implementation?",
"answer_a": "Minimal special token usage with clear role separation",
"answer_b": "Consistent formatting with explicit turn boundaries",
"answer_c": "Efficient token usage while maintaining context",
"answer_d": "Flexible role definition with standardized markers",
"correct_answer": "C"
}
]