Update README.md

@@ -197,7 +197,77 @@ Below are the other eval scores over steps for Llama-3.1-Tulu-3.1-8B:

## Reproduction command

```bash
|
205 |
+
# clone and check out commit
|
206 |
+
git clone https://github.com/allenai/open-instruct.git
|
207 |
+
git checkout 3f37c29ddc97d2c108a7658692d2d2c3708ef182
|
208 |
+
|
209 |
+
# run my exact command for launching exps
|
210 |
+
for learning_rate in 5e-7; do
|
211 |
+
for beta in 0.01; do
|
212 |
+
for nspp in 16; do
|
213 |
+
for m in half-m ; do
|
214 |
+
for kl_estimator in kl3; do
|
215 |
+
local_rollout_batch_size=8
|
216 |
+
# `half-m` is the same as setting number of mini-batches to be 2.
|
217 |
+
if [ $m == "half-m" ]; then
|
218 |
+
local_mini_batch_size=$(($local_rollout_batch_size * $nspp / 2))
|
219 |
+
else
|
220 |
+
local_mini_batch_size=$(($local_rollout_batch_size * $nspp))
|
221 |
+
fi
|
222 |
+
exp_name="0204_lr_scan_grpo_math_lr_${learning_rate}_${kl_estimator}_${beta}_${nspp}_${m}_${RANDOM}"
|
223 |
+
echo $exp_name:
|
224 |
+
echo --- local_mini_batch_size=$local_mini_batch_size
|
225 |
+
echo --- num_gradient_updates=$(($local_rollout_batch_size * $nspp / $local_mini_batch_size))
|
226 |
+
python open_instruct/grpo_vllm_thread_ray_gtrl.py \
|
227 |
+
--exp_name $exp_name \
|
228 |
+
--beta $beta \
|
229 |
+
--local_mini_batch_size $local_mini_batch_size \
|
230 |
+
--number_samples_per_prompt $nspp \
|
231 |
+
--output_dir output/$exp_name \
|
232 |
+
--local_rollout_batch_size $local_rollout_batch_size \
|
233 |
+
--kl_estimator $kl_estimator \
|
234 |
+
--learning_rate $learning_rate \
|
235 |
+
--dataset_mixer_list allenai/RLVR-GSM-MATH-IF-Mixed-Constraints 1.0 \
|
236 |
+
--dataset_mixer_list_splits train \
|
237 |
+
--dataset_mixer_eval_list allenai/RLVR-GSM-MATH-IF-Mixed-Constraints 16 \
|
238 |
+
--dataset_mixer_eval_list_splits train \
|
239 |
+
--max_token_length 2048 \
|
240 |
+
--max_prompt_token_length 2048 \
|
241 |
+
--response_length 2048 \
|
242 |
+
--model_name_or_path allenai/Llama-3.1-Tulu-3-8B-DPO \
|
243 |
+
--non_stop_penalty \
|
244 |
+
--stop_token eos \
|
245 |
+
--temperature 1.0 \
|
246 |
+
--ground_truths_key ground_truth \
|
247 |
+
--chat_template_name tulu \
|
248 |
+
--sft_messages_key messages \
|
249 |
+
--total_episodes 10000000 \
|
250 |
+
--penalty_reward_value 0.0 \
|
251 |
+
--deepspeed_stage 2 \
|
252 |
+
--per_device_train_batch_size 2 \
|
253 |
+
--local_rollout_forward_batch_size 2 \
|
254 |
+
--actor_num_gpus_per_node 6 \
|
255 |
+
--num_epochs 1 \
|
256 |
+
--vllm_tensor_parallel_size 2 \
|
257 |
+
--lr_scheduler_type constant \
|
258 |
+
--apply_verifiable_reward true \
|
259 |
+
--seed 1 \
|
260 |
+
--num_evals 30 \
|
261 |
+
--save_freq 40 \
|
262 |
+
--reward_model_multiplier 0.0 \
|
263 |
+
--gradient_checkpointing \
|
264 |
+
--with_tracking
|
265 |
+
done
|
266 |
+
done
|
267 |
+
done
|
268 |
+
done
|
269 |
+
done
|
270 |
+
```

## License and use