supervised_finetuning_quiz

Runtime error

File size: 15,814 Bytes

c0d911e

[
    {
        "question": "What is Supervised Fine-Tuning (SFT) in the context of LLMs?",
        "answer_a": "A technique to make models run faster",
        "answer_b": "A method to train models on specific tasks using labeled data",
        "answer_c": "A way to reduce model size",
        "answer_d": "A process to create new model architectures",
        "correct_answer": "B"
    },
    {
        "question": "What is LoRA (Low-Rank Adaptation)?",
        "answer_a": "A new type of transformer architecture",
        "answer_b": "A method to compress models after training",
        "answer_c": "An efficient fine-tuning technique that updates a small number of trainable parameters",
        "answer_d": "A data preprocessing technique",
        "correct_answer": "C"
    },
    {
        "question": "What is the main advantage of using LoRA for fine-tuning?",
        "answer_a": "It makes models more accurate than full fine-tuning",
        "answer_b": "It reduces memory requirements and training costs while maintaining performance",
        "answer_c": "It allows training without any original model weights",
        "answer_d": "It automatically improves model inference speed",
        "correct_answer": "B"
    },
    {
        "question": "In chat templates, what is the purpose of the 'system' message?",
        "answer_a": "To log system errors",
        "answer_b": "To define the behavior and role of the assistant",
        "answer_c": "To store user preferences",
        "answer_d": "To handle technical configurations",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is a common format marker in chat templates?",
        "answer_a": "<|im_start|>",
        "answer_b": "{BEGIN}",
        "answer_c": "START_CHAT",
        "answer_d": "<<START>>",
        "correct_answer": "A"
    },
    {
        "question": "What is the primary purpose of SFT datasets?",
        "answer_a": "To create new model architectures",
        "answer_b": "To train models to follow specific instructions and generate desired outputs",
        "answer_c": "To test model performance only",
        "answer_d": "To compress model size",
        "correct_answer": "B"
    },
    {
        "question": "Which statement about LoRA is TRUE?",
        "answer_a": "It requires modifying the original model architecture",
        "answer_b": "It can only be used with small models",
        "answer_c": "It adds low-rank matrices to existing weights during training",
        "answer_d": "It permanently changes the base model weights",
        "correct_answer": "C"
    },
    {
        "question": "What is a key benefit of using standardized chat templates?",
        "answer_a": "They make models run faster",
        "answer_b": "They ensure consistent formatting across different model implementations",
        "answer_c": "They reduce model size",
        "answer_d": "They eliminate the need for tokenization",
        "correct_answer": "B"
    },
    {
        "question": "In the context of SFT, what is a 'prompt-completion' pair?",
        "answer_a": "Two separate models working together",
        "answer_b": "A training example consisting of an input and its desired output",
        "answer_c": "A type of model architecture",
        "answer_d": "A method to compress training data",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is NOT a typical component of a chat template?",
        "answer_a": "System message",
        "answer_b": "User message",
        "answer_c": "Assistant message",
        "answer_d": "Database message",
        "correct_answer": "D"
    },
    {
        "question": "What is the purpose of the SFTTrainer in the TRL library?",
        "answer_a": "To train models from scratch",
        "answer_b": "To simplify the process of fine-tuning language models on instruction datasets",
        "answer_c": "To evaluate model performance",
        "answer_d": "To compress model weights",
        "correct_answer": "B"
    },
    {
        "question": "What is a key characteristic of LoRA's training approach?",
        "answer_a": "It trains all model parameters",
        "answer_b": "It only works with small models",
        "answer_c": "It freezes the original model weights and injects trainable rank decomposition matrices",
        "answer_d": "It requires multiple GPUs",
        "correct_answer": "C"
    },
    {
        "question": "Which parameter in LoRA determines the size of the rank decomposition matrices?",
        "answer_a": "lora_alpha",
        "answer_b": "r",
        "answer_c": "dropout",
        "answer_d": "bias",
        "correct_answer": "B"
    },
    {
        "question": "What is the role of 'target_modules' in LoRA configuration?",
        "answer_a": "To specify which layers to remove",
        "answer_b": "To define which layers will be adapted with LoRA",
        "answer_c": "To set the learning rate for each layer",
        "answer_d": "To determine model output",
        "correct_answer": "B"
    },
    {
        "question": "What is the purpose of chat template's 'add_generation_prompt' parameter?",
        "answer_a": "To end the conversation",
        "answer_b": "To add a prompt for the model to continue generating",
        "answer_c": "To change the system message",
        "answer_d": "To modify user input",
        "correct_answer": "B"
    },
    {
        "question": "In SFT training, what is gradient checkpointing used for?",
        "answer_a": "To save training progress",
        "answer_b": "To reduce memory usage during training",
        "answer_c": "To increase model accuracy",
        "answer_d": "To speed up training",
        "correct_answer": "B"
    },
    {
        "question": "What is the purpose of the 'lora_alpha' parameter in LoRA?",
        "answer_a": "To set the learning rate",
        "answer_b": "To scale the LoRA weights during inference",
        "answer_c": "To determine batch size",
        "answer_d": "To control model size",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is a benefit of using the SFTTrainer?",
        "answer_a": "It automatically handles padding and truncation of inputs",
        "answer_b": "It creates new model architectures",
        "answer_c": "It performs unsupervised learning",
        "answer_d": "It generates training data",
        "correct_answer": "A"
    },
    {
        "question": "What is the purpose of 'formatting_func' in SFTTrainer?",
        "answer_a": "To format the output text",
        "answer_b": "To preprocess and structure the training data",
        "answer_c": "To modify model architecture",
        "answer_d": "To handle error messages",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is TRUE about LoRA training?",
        "answer_a": "It requires more memory than full fine-tuning",
        "answer_b": "It can only be used with specific model architectures",
        "answer_c": "It allows efficient adaptation while keeping original weights frozen",
        "answer_d": "It always produces better results than full fine-tuning",
        "correct_answer": "C"
    },
    {
        "question": "What is the purpose of 'max_seq_length' in SFTTrainer?",
        "answer_a": "To limit the model's vocabulary size",
        "answer_b": "To set the maximum length of input sequences",
        "answer_c": "To determine the batch size",
        "answer_d": "To control the learning rate",
        "correct_answer": "B"
    },
    {
        "question": "In chat templates, what is the purpose of conversation history?",
        "answer_a": "To store user preferences",
        "answer_b": "To maintain context across multiple turns of dialogue",
        "answer_c": "To track error messages",
        "answer_d": "To count tokens",
        "correct_answer": "B"
    },
    {
        "question": "What is a key advantage of using BitsAndBytes for SFT?",
        "answer_a": "It makes training faster",
        "answer_b": "It reduces memory usage through quantization",
        "answer_c": "It improves model accuracy",
        "answer_d": "It simplifies the code",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is NOT a typical parameter in LoRA configuration?",
        "answer_a": "r",
        "answer_b": "lora_alpha",
        "answer_c": "model_size",
        "answer_d": "target_modules",
        "correct_answer": "C"
    },
    {
        "question": "What is the purpose of 'warmup_ratio' in training arguments?",
        "answer_a": "To set the final learning rate",
        "answer_b": "To determine the portion of training used for learning rate warmup",
        "answer_c": "To control model temperature",
        "answer_d": "To set the batch size",
        "correct_answer": "B"
    },
    {
        "question": "Which statement about SFT datasets is TRUE?",
        "answer_a": "They must always be in JSON format",
        "answer_b": "They typically contain input-output pairs for training",
        "answer_c": "They can only contain single-turn conversations",
        "answer_d": "They must include system prompts",
        "correct_answer": "B"
    },
    {
        "question": "What is the role of 'gradient_accumulation_steps' in training?",
        "answer_a": "To speed up training",
        "answer_b": "To simulate larger batch sizes with limited memory",
        "answer_c": "To reduce model size",
        "answer_d": "To improve accuracy",
        "correct_answer": "B"
    },
    {
        "question": "Which of these is a common use case for LoRA?",
        "answer_a": "Creating new model architectures",
        "answer_b": "Adapting large models to specific tasks efficiently",
        "answer_c": "Reducing model inference time",
        "answer_d": "Generating training data",
        "correct_answer": "B"
    },
    {
        "question": "What is the purpose of 'save_total_limit' in training arguments?",
        "answer_a": "To limit the model's vocabulary",
        "answer_b": "To control how many checkpoints are saved during training",
        "answer_c": "To set the maximum sequence length",
        "answer_d": "To limit training time",
        "correct_answer": "B"
    },
    {
        "question": "Which optimization technique is commonly used with LoRA?",
        "answer_a": "SGD",
        "answer_b": "AdamW",
        "answer_c": "RMSprop",
        "answer_d": "Momentum",
        "correct_answer": "B"
    },
    {
        "question": "What is the most significant difference between full fine-tuning and LoRA?",
        "answer_a": "LoRA updates a subset of model weights while full fine-tuning updates all weights",
        "answer_b": "LoRA adds new parameters while keeping original weights frozen",
        "answer_c": "LoRA modifies attention layers while full fine-tuning modifies feed-forward layers",
        "answer_d": "LoRA trains faster but requires more memory than full fine-tuning",
        "correct_answer": "B"
    },
    {
        "question": "When implementing chat templates, which approach is most likely to maintain model performance?",
        "answer_a": "Using the exact template format from the model's training data",
        "answer_b": "Using a simplified template with just role and content",
        "answer_c": "Using a standardized template across all models",
        "answer_d": "Using a template with additional control tokens",
        "correct_answer": "A"
    },
    {
        "question": "What is the key technical innovation of LoRA's rank decomposition approach?",
        "answer_a": "It reduces model parameters through matrix factorization",
        "answer_b": "It decomposes weight updates into low-rank matrices while preserving model capacity",
        "answer_c": "It compresses the model weights using SVD decomposition",
        "answer_d": "It optimizes attention mechanisms through rank reduction",
        "correct_answer": "B"
    },
    {
        "question": "How does the 'r' parameter in LoRA affect the training process?",
        "answer_a": "Higher r increases model capacity but requires more memory",
        "answer_b": "Lower r reduces training time but may impact performance",
        "answer_c": "Higher r improves convergence but increases computation",
        "answer_d": "Lower r decreases memory usage but may limit expressiveness",
        "correct_answer": "D"
    },
    {
        "question": "What is the primary consideration when choosing target_modules for LoRA?",
        "answer_a": "Selecting layers that most influence task-specific behavior",
        "answer_b": "Targeting modules with the most parameters",
        "answer_c": "Choosing layers closest to the model output",
        "answer_d": "Selecting modules with the least impact on inference speed",
        "correct_answer": "A"
    },
    {
        "question": "How does gradient checkpointing affect the training process in SFT?",
        "answer_a": "Trades computation time for reduced memory usage",
        "answer_b": "Reduces memory by storing fewer activation gradients",
        "answer_c": "Improves training stability through gradient accumulation",
        "answer_d": "Optimizes memory by recomputing forward passes",
        "correct_answer": "A"
    },
    {
        "question": "What role does lora_alpha play in the training dynamics?",
        "answer_a": "Controls the learning rate scaling of LoRA updates",
        "answer_b": "Scales the contribution of LoRA weights during inference",
        "answer_c": "Determines the initialization range of LoRA matrices",
        "answer_d": "Adjusts the gradient flow through LoRA layers",
        "correct_answer": "B"
    },
    {
        "question": "Which aspect of SFT datasets most influences training effectiveness?",
        "answer_a": "The diversity of instruction-output pairs",
        "answer_b": "The total number of training examples",
        "answer_c": "The complexity of individual instructions",
        "answer_d": "The length of output sequences",
        "correct_answer": "A"
    },
    {
        "question": "How does warmup_ratio impact the training dynamics?",
        "answer_a": "Prevents early overfitting by gradually increasing learning rate",
        "answer_b": "Stabilizes initial training by ramping up learning rate",
        "answer_c": "Reduces gradient variance in early training steps",
        "answer_d": "Improves model convergence through learning rate scheduling",
        "correct_answer": "B"
    },
    {
        "question": "What is the primary challenge addressed by gradient_accumulation_steps?",
        "answer_a": "Memory constraints limiting batch size",
        "answer_b": "Training instability with large learning rates",
        "answer_c": "Slow convergence with small batches",
        "answer_d": "Gradient vanishing in deep networks",
        "correct_answer": "A"
    },
    {
        "question": "How does BitsAndBytes quantization affect SFT training?",
        "answer_a": "Reduces precision while maintaining training stability",
        "answer_b": "Compresses weights with minimal performance impact",
        "answer_c": "Optimizes memory usage through dynamic quantization",
        "answer_d": "Balances precision and memory requirements",
        "correct_answer": "D"
    },
    {
        "question": "What distinguishes an effective chat template implementation?",
        "answer_a": "Minimal special token usage with clear role separation",
        "answer_b": "Consistent formatting with explicit turn boundaries",
        "answer_c": "Efficient token usage while maintaining context",
        "answer_d": "Flexible role definition with standardized markers",
        "correct_answer": "C"
    }
]