Spaces:
Runtime error
Runtime error
[ | |
{ | |
"question": "What is Supervised Fine-Tuning (SFT) in the context of LLMs?", | |
"answer_a": "A technique to make models run faster", | |
"answer_b": "A method to train models on specific tasks using labeled data", | |
"answer_c": "A way to reduce model size", | |
"answer_d": "A process to create new model architectures", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is LoRA (Low-Rank Adaptation)?", | |
"answer_a": "A new type of transformer architecture", | |
"answer_b": "A method to compress models after training", | |
"answer_c": "An efficient fine-tuning technique that updates a small number of trainable parameters", | |
"answer_d": "A data preprocessing technique", | |
"correct_answer": "C" | |
}, | |
{ | |
"question": "What is the main advantage of using LoRA for fine-tuning?", | |
"answer_a": "It makes models more accurate than full fine-tuning", | |
"answer_b": "It reduces memory requirements and training costs while maintaining performance", | |
"answer_c": "It allows training without any original model weights", | |
"answer_d": "It automatically improves model inference speed", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "In chat templates, what is the purpose of the 'system' message?", | |
"answer_a": "To log system errors", | |
"answer_b": "To define the behavior and role of the assistant", | |
"answer_c": "To store user preferences", | |
"answer_d": "To handle technical configurations", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which of these is a common format marker in chat templates?", | |
"answer_a": "<|im_start|>", | |
"answer_b": "{BEGIN}", | |
"answer_c": "START_CHAT", | |
"answer_d": "<<START>>", | |
"correct_answer": "A" | |
}, | |
{ | |
"question": "What is the primary purpose of SFT datasets?", | |
"answer_a": "To create new model architectures", | |
"answer_b": "To train models to follow specific instructions and generate desired outputs", | |
"answer_c": "To test model performance only", | |
"answer_d": "To compress model size", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which statement about LoRA is TRUE?", | |
"answer_a": "It requires modifying the original model architecture", | |
"answer_b": "It can only be used with small models", | |
"answer_c": "It adds low-rank matrices to existing weights during training", | |
"answer_d": "It permanently changes the base model weights", | |
"correct_answer": "C" | |
}, | |
{ | |
"question": "What is a key benefit of using standardized chat templates?", | |
"answer_a": "They make models run faster", | |
"answer_b": "They ensure consistent formatting across different model implementations", | |
"answer_c": "They reduce model size", | |
"answer_d": "They eliminate the need for tokenization", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "In the context of SFT, what is a 'prompt-completion' pair?", | |
"answer_a": "Two separate models working together", | |
"answer_b": "A training example consisting of an input and its desired output", | |
"answer_c": "A type of model architecture", | |
"answer_d": "A method to compress training data", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which of these is NOT a typical component of a chat template?", | |
"answer_a": "System message", | |
"answer_b": "User message", | |
"answer_c": "Assistant message", | |
"answer_d": "Database message", | |
"correct_answer": "D" | |
}, | |
{ | |
"question": "What is the purpose of the SFTTrainer in the TRL library?", | |
"answer_a": "To train models from scratch", | |
"answer_b": "To simplify the process of fine-tuning language models on instruction datasets", | |
"answer_c": "To evaluate model performance", | |
"answer_d": "To compress model weights", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is a key characteristic of LoRA's training approach?", | |
"answer_a": "It trains all model parameters", | |
"answer_b": "It only works with small models", | |
"answer_c": "It freezes the original model weights and injects trainable rank decomposition matrices", | |
"answer_d": "It requires multiple GPUs", | |
"correct_answer": "C" | |
}, | |
{ | |
"question": "Which parameter in LoRA determines the size of the rank decomposition matrices?", | |
"answer_a": "lora_alpha", | |
"answer_b": "r", | |
"answer_c": "dropout", | |
"answer_d": "bias", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is the role of 'target_modules' in LoRA configuration?", | |
"answer_a": "To specify which layers to remove", | |
"answer_b": "To define which layers will be adapted with LoRA", | |
"answer_c": "To set the learning rate for each layer", | |
"answer_d": "To determine model output", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is the purpose of chat template's 'add_generation_prompt' parameter?", | |
"answer_a": "To end the conversation", | |
"answer_b": "To add a prompt for the model to continue generating", | |
"answer_c": "To change the system message", | |
"answer_d": "To modify user input", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "In SFT training, what is gradient checkpointing used for?", | |
"answer_a": "To save training progress", | |
"answer_b": "To reduce memory usage during training", | |
"answer_c": "To increase model accuracy", | |
"answer_d": "To speed up training", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is the purpose of the 'lora_alpha' parameter in LoRA?", | |
"answer_a": "To set the learning rate", | |
"answer_b": "To scale the LoRA weights during inference", | |
"answer_c": "To determine batch size", | |
"answer_d": "To control model size", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which of these is a benefit of using the SFTTrainer?", | |
"answer_a": "It automatically handles padding and truncation of inputs", | |
"answer_b": "It creates new model architectures", | |
"answer_c": "It performs unsupervised learning", | |
"answer_d": "It generates training data", | |
"correct_answer": "A" | |
}, | |
{ | |
"question": "What is the purpose of 'formatting_func' in SFTTrainer?", | |
"answer_a": "To format the output text", | |
"answer_b": "To preprocess and structure the training data", | |
"answer_c": "To modify model architecture", | |
"answer_d": "To handle error messages", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which of these is TRUE about LoRA training?", | |
"answer_a": "It requires more memory than full fine-tuning", | |
"answer_b": "It can only be used with specific model architectures", | |
"answer_c": "It allows efficient adaptation while keeping original weights frozen", | |
"answer_d": "It always produces better results than full fine-tuning", | |
"correct_answer": "C" | |
}, | |
{ | |
"question": "What is the purpose of 'max_seq_length' in SFTTrainer?", | |
"answer_a": "To limit the model's vocabulary size", | |
"answer_b": "To set the maximum length of input sequences", | |
"answer_c": "To determine the batch size", | |
"answer_d": "To control the learning rate", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "In chat templates, what is the purpose of conversation history?", | |
"answer_a": "To store user preferences", | |
"answer_b": "To maintain context across multiple turns of dialogue", | |
"answer_c": "To track error messages", | |
"answer_d": "To count tokens", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is a key advantage of using BitsAndBytes for SFT?", | |
"answer_a": "It makes training faster", | |
"answer_b": "It reduces memory usage through quantization", | |
"answer_c": "It improves model accuracy", | |
"answer_d": "It simplifies the code", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which of these is NOT a typical parameter in LoRA configuration?", | |
"answer_a": "r", | |
"answer_b": "lora_alpha", | |
"answer_c": "model_size", | |
"answer_d": "target_modules", | |
"correct_answer": "C" | |
}, | |
{ | |
"question": "What is the purpose of 'warmup_ratio' in training arguments?", | |
"answer_a": "To set the final learning rate", | |
"answer_b": "To determine the portion of training used for learning rate warmup", | |
"answer_c": "To control model temperature", | |
"answer_d": "To set the batch size", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which statement about SFT datasets is TRUE?", | |
"answer_a": "They must always be in JSON format", | |
"answer_b": "They typically contain input-output pairs for training", | |
"answer_c": "They can only contain single-turn conversations", | |
"answer_d": "They must include system prompts", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is the role of 'gradient_accumulation_steps' in training?", | |
"answer_a": "To speed up training", | |
"answer_b": "To simulate larger batch sizes with limited memory", | |
"answer_c": "To reduce model size", | |
"answer_d": "To improve accuracy", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which of these is a common use case for LoRA?", | |
"answer_a": "Creating new model architectures", | |
"answer_b": "Adapting large models to specific tasks efficiently", | |
"answer_c": "Reducing model inference time", | |
"answer_d": "Generating training data", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is the purpose of 'save_total_limit' in training arguments?", | |
"answer_a": "To limit the model's vocabulary", | |
"answer_b": "To control how many checkpoints are saved during training", | |
"answer_c": "To set the maximum sequence length", | |
"answer_d": "To limit training time", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which optimization technique is commonly used with LoRA?", | |
"answer_a": "SGD", | |
"answer_b": "AdamW", | |
"answer_c": "RMSprop", | |
"answer_d": "Momentum", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is the most significant difference between full fine-tuning and LoRA?", | |
"answer_a": "LoRA updates a subset of model weights while full fine-tuning updates all weights", | |
"answer_b": "LoRA adds new parameters while keeping original weights frozen", | |
"answer_c": "LoRA modifies attention layers while full fine-tuning modifies feed-forward layers", | |
"answer_d": "LoRA trains faster but requires more memory than full fine-tuning", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "When implementing chat templates, which approach is most likely to maintain model performance?", | |
"answer_a": "Using the exact template format from the model's training data", | |
"answer_b": "Using a simplified template with just role and content", | |
"answer_c": "Using a standardized template across all models", | |
"answer_d": "Using a template with additional control tokens", | |
"correct_answer": "A" | |
}, | |
{ | |
"question": "What is the key technical innovation of LoRA's rank decomposition approach?", | |
"answer_a": "It reduces model parameters through matrix factorization", | |
"answer_b": "It decomposes weight updates into low-rank matrices while preserving model capacity", | |
"answer_c": "It compresses the model weights using SVD decomposition", | |
"answer_d": "It optimizes attention mechanisms through rank reduction", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "How does the 'r' parameter in LoRA affect the training process?", | |
"answer_a": "Higher r increases model capacity but requires more memory", | |
"answer_b": "Lower r reduces training time but may impact performance", | |
"answer_c": "Higher r improves convergence but increases computation", | |
"answer_d": "Lower r decreases memory usage but may limit expressiveness", | |
"correct_answer": "D" | |
}, | |
{ | |
"question": "What is the primary consideration when choosing target_modules for LoRA?", | |
"answer_a": "Selecting layers that most influence task-specific behavior", | |
"answer_b": "Targeting modules with the most parameters", | |
"answer_c": "Choosing layers closest to the model output", | |
"answer_d": "Selecting modules with the least impact on inference speed", | |
"correct_answer": "A" | |
}, | |
{ | |
"question": "How does gradient checkpointing affect the training process in SFT?", | |
"answer_a": "Trades computation time for reduced memory usage", | |
"answer_b": "Reduces memory by storing fewer activation gradients", | |
"answer_c": "Improves training stability through gradient accumulation", | |
"answer_d": "Optimizes memory by recomputing forward passes", | |
"correct_answer": "A" | |
}, | |
{ | |
"question": "What role does lora_alpha play in the training dynamics?", | |
"answer_a": "Controls the learning rate scaling of LoRA updates", | |
"answer_b": "Scales the contribution of LoRA weights during inference", | |
"answer_c": "Determines the initialization range of LoRA matrices", | |
"answer_d": "Adjusts the gradient flow through LoRA layers", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "Which aspect of SFT datasets most influences training effectiveness?", | |
"answer_a": "The diversity of instruction-output pairs", | |
"answer_b": "The total number of training examples", | |
"answer_c": "The complexity of individual instructions", | |
"answer_d": "The length of output sequences", | |
"correct_answer": "A" | |
}, | |
{ | |
"question": "How does warmup_ratio impact the training dynamics?", | |
"answer_a": "Prevents early overfitting by gradually increasing learning rate", | |
"answer_b": "Stabilizes initial training by ramping up learning rate", | |
"answer_c": "Reduces gradient variance in early training steps", | |
"answer_d": "Improves model convergence through learning rate scheduling", | |
"correct_answer": "B" | |
}, | |
{ | |
"question": "What is the primary challenge addressed by gradient_accumulation_steps?", | |
"answer_a": "Memory constraints limiting batch size", | |
"answer_b": "Training instability with large learning rates", | |
"answer_c": "Slow convergence with small batches", | |
"answer_d": "Gradient vanishing in deep networks", | |
"correct_answer": "A" | |
}, | |
{ | |
"question": "How does BitsAndBytes quantization affect SFT training?", | |
"answer_a": "Reduces precision while maintaining training stability", | |
"answer_b": "Compresses weights with minimal performance impact", | |
"answer_c": "Optimizes memory usage through dynamic quantization", | |
"answer_d": "Balances precision and memory requirements", | |
"correct_answer": "D" | |
}, | |
{ | |
"question": "What distinguishes an effective chat template implementation?", | |
"answer_a": "Minimal special token usage with clear role separation", | |
"answer_b": "Consistent formatting with explicit turn boundaries", | |
"answer_c": "Efficient token usage while maintaining context", | |
"answer_d": "Flexible role definition with standardized markers", | |
"correct_answer": "C" | |
} | |
] |