[
    {
        "question": "What is Supervised Fine-Tuning (SFT) in the context of LLMs?",
        "answer_a": "A technique to make models run faster",
        "answer_b": "A method to train models on specific tasks using labeled data",
        "answer_c": "A way to reduce model size",
        "answer_d": "A process to create new model architectures",
        "correct_answer": "B"
    },
    {
        "question": "What is LoRA (Low-Rank Adaptation)?",
        "answer_a": "A new type of transformer architecture",
        "answer_b": "A method to compress models after training",
        "answer_c": "An efficient fine-tuning technique that updates a small number of trainable parameters",
        "answer_d": "A data preprocessing technique",
        "correct_answer": "C"
    },
    {
        "question": "What is the main advantage of using LoRA for fine-tuning?",
        "answer_a": "It makes models more accurate than full fine-tuning",
        "answer_b": "It reduces memory requirements and training costs while maintaining performance",
        "answer_c": "It allows training without any original model weights",
        "answer_d": "It automatically improves model inference speed",
        "correct_answer": "B"
    },
    {
        "question": "In chat templates, what is the purpose of the 'system' message?",
        "answer_a": "To log system errors",
        "answer_b": "To define the behavior and role of the assistant",
        "answer_c": "To store user preferences",
        "answer_d": "To handle technical configurations",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is a common format marker in chat templates?",
        "answer_a": "<|im_start|>",
        "answer_b": "{BEGIN}",
        "answer_c": "START_CHAT",
        "answer_d": "<<START>>",
        "correct_answer": "A"
    },
    {
        "question": "What is the primary purpose of SFT datasets?",
        "answer_a": "To create new model architectures",
        "answer_b": "To train models to follow specific instructions and generate desired outputs",
        "answer_c": "To test model performance only",
        "answer_d": "To compress model size",
        "correct_answer": "B"
    },
    {
        "question": "Which statement about LoRA is TRUE?",
        "answer_a": "It requires modifying the original model architecture",
        "answer_b": "It can only be used with small models",
        "answer_c": "It adds low-rank matrices to existing weights during training",
        "answer_d": "It permanently changes the base model weights",
        "correct_answer": "C"
    },
    {
        "question": "What is a key benefit of using standardized chat templates?",
        "answer_a": "They make models run faster",
        "answer_b": "They ensure consistent formatting across different model implementations",
        "answer_c": "They reduce model size",
        "answer_d": "They eliminate the need for tokenization",
        "correct_answer": "B"
    },
    {
        "question": "In the context of SFT, what is a 'prompt-completion' pair?",
        "answer_a": "Two separate models working together",
        "answer_b": "A training example consisting of an input and its desired output",
        "answer_c": "A type of model architecture",
        "answer_d": "A method to compress training data",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is NOT a typical component of a chat template?",
        "answer_a": "System message",
        "answer_b": "User message",
        "answer_c": "Assistant message",
        "answer_d": "Database message",
        "correct_answer": "D"
    },
    {
        "question": "What is the purpose of the SFTTrainer in the TRL library?",
        "answer_a": "To train models from scratch",
        "answer_b": "To simplify the process of fine-tuning language models on instruction datasets",
        "answer_c": "To evaluate model performance",
        "answer_d": "To compress model weights",
        "correct_answer": "B"
    },
    {
        "question": "What is a key characteristic of LoRA's training approach?",
        "answer_a": "It trains all model parameters",
        "answer_b": "It only works with small models",
        "answer_c": "It freezes the original model weights and injects trainable rank decomposition matrices",
        "answer_d": "It requires multiple GPUs",
        "correct_answer": "C"
    },
    {
        "question": "Which parameter in LoRA determines the size of the rank decomposition matrices?",
        "answer_a": "lora_alpha",
        "answer_b": "r",
        "answer_c": "dropout",
        "answer_d": "bias",
        "correct_answer": "B"
    },
    {
        "question": "What is the role of 'target_modules' in LoRA configuration?",
        "answer_a": "To specify which layers to remove",
        "answer_b": "To define which layers will be adapted with LoRA",
        "answer_c": "To set the learning rate for each layer",
        "answer_d": "To determine model output",
        "correct_answer": "B"
    },
    {
        "question": "What is the purpose of the 'add_generation_prompt' parameter in chat templates?",
        "answer_a": "To end the conversation",
        "answer_b": "To add a prompt for the model to continue generating",
        "answer_c": "To change the system message",
        "answer_d": "To modify user input",
        "correct_answer": "B"
    },
    {
        "question": "In SFT training, what is gradient checkpointing used for?",
        "answer_a": "To save training progress",
        "answer_b": "To reduce memory usage during training",
        "answer_c": "To increase model accuracy",
        "answer_d": "To speed up training",
        "correct_answer": "B"
    },
    {
        "question": "What is the purpose of the 'lora_alpha' parameter in LoRA?",
        "answer_a": "To set the learning rate",
        "answer_b": "To scale the LoRA weights during inference",
        "answer_c": "To determine batch size",
        "answer_d": "To control model size",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is a benefit of using the SFTTrainer?",
        "answer_a": "It automatically handles padding and truncation of inputs",
        "answer_b": "It creates new model architectures",
        "answer_c": "It performs unsupervised learning",
        "answer_d": "It generates training data",
        "correct_answer": "A"
    },
    {
        "question": "What is the purpose of 'formatting_func' in SFTTrainer?",
        "answer_a": "To format the output text",
        "answer_b": "To preprocess and structure the training data",
        "answer_c": "To modify model architecture",
        "answer_d": "To handle error messages",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is TRUE about LoRA training?",
        "answer_a": "It requires more memory than full fine-tuning",
        "answer_b": "It can only be used with specific model architectures",
        "answer_c": "It allows efficient adaptation while keeping original weights frozen",
        "answer_d": "It always produces better results than full fine-tuning",
        "correct_answer": "C"
    },
    {
        "question": "What is the purpose of 'max_seq_length' in SFTTrainer?",
        "answer_a": "To limit the model's vocabulary size",
        "answer_b": "To set the maximum length of input sequences",
        "answer_c": "To determine the batch size",
        "answer_d": "To control the learning rate",
        "correct_answer": "B"
    },
    {
        "question": "In chat templates, what is the purpose of conversation history?",
        "answer_a": "To store user preferences",
        "answer_b": "To maintain context across multiple turns of dialogue",
        "answer_c": "To track error messages",
        "answer_d": "To count tokens",
        "correct_answer": "B"
    },
    {
        "question": "What is a key advantage of using BitsAndBytes for SFT?",
        "answer_a": "It makes training faster",
        "answer_b": "It reduces memory usage through quantization",
        "answer_c": "It improves model accuracy",
        "answer_d": "It simplifies the code",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is NOT a typical parameter in LoRA configuration?",
        "answer_a": "r",
        "answer_b": "lora_alpha",
        "answer_c": "model_size",
        "answer_d": "target_modules",
        "correct_answer": "C"
    },
    {
        "question": "What is the purpose of 'warmup_ratio' in training arguments?",
        "answer_a": "To set the final learning rate",
        "answer_b": "To determine the portion of training used for learning rate warmup",
        "answer_c": "To control model temperature",
        "answer_d": "To set the batch size",
        "correct_answer": "B"
    },
    {
        "question": "Which statement about SFT datasets is TRUE?",
        "answer_a": "They must always be in JSON format",
        "answer_b": "They typically contain input-output pairs for training",
        "answer_c": "They can only contain single-turn conversations",
        "answer_d": "They must include system prompts",
        "correct_answer": "B"
    },
    {
        "question": "What is the role of 'gradient_accumulation_steps' in training?",
        "answer_a": "To speed up training",
        "answer_b": "To simulate larger batch sizes with limited memory",
        "answer_c": "To reduce model size",
        "answer_d": "To improve accuracy",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is a common use case for LoRA?",
        "answer_a": "Creating new model architectures",
        "answer_b": "Adapting large models to specific tasks efficiently",
        "answer_c": "Reducing model inference time",
        "answer_d": "Generating training data",
        "correct_answer": "B"
    },
    {
        "question": "What is the purpose of 'save_total_limit' in training arguments?",
        "answer_a": "To limit the model's vocabulary",
        "answer_b": "To control how many checkpoints are saved during training",
        "answer_c": "To set the maximum sequence length",
        "answer_d": "To limit training time",
        "correct_answer": "B"
    },
    {
        "question": "Which optimization technique is commonly used with LoRA?",
        "answer_a": "SGD",
        "answer_b": "AdamW",
        "answer_c": "RMSprop",
        "answer_d": "Momentum",
        "correct_answer": "B"
    },
    {
        "question": "What is the most significant difference between full fine-tuning and LoRA?",
        "answer_a": "LoRA updates a subset of model weights while full fine-tuning updates all weights",
        "answer_b": "LoRA adds new parameters while keeping original weights frozen",
        "answer_c": "LoRA modifies attention layers while full fine-tuning modifies feed-forward layers",
        "answer_d": "LoRA trains faster but requires more memory than full fine-tuning",
        "correct_answer": "B"
    },
    {
        "question": "When implementing chat templates, which approach is most likely to maintain model performance?",
        "answer_a": "Using the exact template format from the model's training data",
        "answer_b": "Using a simplified template with just role and content",
        "answer_c": "Using a standardized template across all models",
        "answer_d": "Using a template with additional control tokens",
        "correct_answer": "A"
    },
    {
        "question": "What is the key technical innovation of LoRA's rank decomposition approach?",
        "answer_a": "It reduces model parameters through matrix factorization",
        "answer_b": "It decomposes weight updates into low-rank matrices while preserving model capacity",
        "answer_c": "It compresses the model weights using SVD decomposition",
        "answer_d": "It optimizes attention mechanisms through rank reduction",
        "correct_answer": "B"
    },
    {
        "question": "How does the 'r' parameter in LoRA affect the training process?",
        "answer_a": "Higher r increases model capacity but requires more memory",
        "answer_b": "Lower r reduces training time but may impact performance",
        "answer_c": "Higher r improves convergence but increases computation",
        "answer_d": "Lower r decreases memory usage but may limit expressiveness",
        "correct_answer": "D"
    },
    {
        "question": "What is the primary consideration when choosing target_modules for LoRA?",
        "answer_a": "Selecting layers that most influence task-specific behavior",
        "answer_b": "Targeting modules with the most parameters",
        "answer_c": "Choosing layers closest to the model output",
        "answer_d": "Selecting modules with the least impact on inference speed",
        "correct_answer": "A"
    },
    {
        "question": "How does gradient checkpointing affect the training process in SFT?",
        "answer_a": "Trades computation time for reduced memory usage",
        "answer_b": "Reduces memory by storing fewer activation gradients",
        "answer_c": "Improves training stability through gradient accumulation",
        "answer_d": "Optimizes memory by recomputing forward passes",
        "correct_answer": "A"
    },
    {
        "question": "What role does lora_alpha play in the training dynamics?",
        "answer_a": "Controls the learning rate scaling of LoRA updates",
        "answer_b": "Scales the contribution of LoRA weights during inference",
        "answer_c": "Determines the initialization range of LoRA matrices",
        "answer_d": "Adjusts the gradient flow through LoRA layers",
        "correct_answer": "B"
    },
    {
        "question": "Which aspect of SFT datasets most influences training effectiveness?",
        "answer_a": "The diversity of instruction-output pairs",
        "answer_b": "The total number of training examples",
        "answer_c": "The complexity of individual instructions",
        "answer_d": "The length of output sequences",
        "correct_answer": "A"
    },
    {
        "question": "How does warmup_ratio impact the training dynamics?",
        "answer_a": "Prevents early overfitting by gradually increasing learning rate",
        "answer_b": "Stabilizes initial training by ramping up learning rate",
        "answer_c": "Reduces gradient variance in early training steps",
        "answer_d": "Improves model convergence through learning rate scheduling",
        "correct_answer": "B"
    },
    {
        "question": "What is the primary challenge addressed by gradient_accumulation_steps?",
        "answer_a": "Memory constraints limiting batch size",
        "answer_b": "Training instability with large learning rates",
        "answer_c": "Slow convergence with small batches",
        "answer_d": "Gradient vanishing in deep networks",
        "correct_answer": "A"
    },
    {
        "question": "How does BitsAndBytes quantization affect SFT training?",
        "answer_a": "Reduces precision while maintaining training stability",
        "answer_b": "Compresses weights with minimal performance impact",
        "answer_c": "Optimizes memory usage through dynamic quantization",
        "answer_d": "Balances precision and memory requirements",
        "correct_answer": "D"
    },
    {
        "question": "What distinguishes an effective chat template implementation?",
        "answer_a": "Minimal special token usage with clear role separation",
        "answer_b": "Consistent formatting with explicit turn boundaries",
        "answer_c": "Efficient token usage while maintaining context",
        "answer_d": "Flexible role definition with standardized markers",
        "correct_answer": "C"
    }
]