File size: 2,424 Bytes
a344f64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# CHECK_EVERY=900
# DURATION_DAYS=10
# CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))
# NEPOCH_PRE=99
# NEPOCH_SFT=159
# NAME="audio-gen-train_audiogen"

# for (( i = 1; i <= $CHECK_TOTAL; i++ )) 
# do
#     RUNNING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep RUNNING | grep polar | sort)
#     PENDING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep PENDING | grep polar | sort)

#     for STATE in "RUNNING" "PENDING" "NOT-RUN"
#     do
#         echo "===========${STATE}=========="

#         if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} =~ "${NAME}" ]]; then
#             echo ${NAME}
#         elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} =~ "${NAME}" ]]; then
#             echo ${NAME}
#         elif [[ ${STATE} == "NOT-RUN" && ! ${RUNNING_JOBS} =~ "${NAME}" && ! ${PENDING_JOBS} =~ "${NAME}" ]]; then

#             base_path="/lustre/fsw/portfolios/adlr/users/sreyang/ckpts/stable_llm/harmonai_train/"
#             # Find the last subfolder
#             last_subfolder=$(ls -d "$base_path"*/ | sort -V | tail -n 1)
#             # Find the last checkpoint in the subfolder
#             last_ckpt=$(ls "$last_subfolder/checkpoints/"*.ckpt | sort -V | tail -n 1)
#             echo $last_ckpt
#             sh submit_job.sh "True" $last_ckpt
#             sleep 1
#         fi
#     done
#     echo "============================"
#     sleep $CHECK_EVERY
# done

# Job watchdog: poll sacct every CHECK_EVERY seconds for DURATION_DAYS days and
# run submit.sh whenever no RUNNING or PENDING job line contains ${NAME}.
CHECK_EVERY=900                                   # seconds between polls
DURATION_DAYS=10                                  # how long to keep watching
CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))  # number of polls (960 with the values above)
# NOTE(review): NEPOCH_PRE / NEPOCH_SFT are not referenced in this section and
# are not exported, so submit.sh cannot read them; kept in case something
# outside this view relies on them -- confirm before removing.
NEPOCH_PRE=99
NEPOCH_SFT=159
NAME="eval"                                       # matched as a literal substring of each sacct line

for (( i = 1; i <= CHECK_TOTAL; i++ ))
do
    # Query sacct exactly once per poll. Two separate queries (one for RUNNING,
    # one for PENDING) can miss a job that changes state between them, which
    # would make the job look absent and trigger a duplicate submission.
    if ! sacct_out=$(sacct -o JobName%-150,JobID,Partition%-15,State); then
        # A failed query must not be mistaken for "no job in the queue".
        echo "sacct failed; skipping this check" >&2
        sleep "${CHECK_EVERY}"
        continue
    fi

    # Keep lines that mention "polar" (presumably the partition -- confirm) and
    # drop anything mentioning "inference", then split the snapshot by state.
    job_lines=$(grep -v inference <<<"${sacct_out}" | grep polar | sort)
    RUNNING_JOBS=$(grep RUNNING <<<"${job_lines}")
    PENDING_JOBS=$(grep PENDING <<<"${job_lines}")

    # One banner per state; under it, print ${NAME} if the job is in that state,
    # or submit the job under NOT-RUN if it is in neither list.
    for STATE in "RUNNING" "PENDING" "NOT-RUN"
    do
        echo "===========${STATE}=========="

        if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} == *"${NAME}"* ]]; then
            echo "${NAME}"
        elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} == *"${NAME}"* ]]; then
            echo "${NAME}"
        elif [[ ${STATE} == "NOT-RUN" && ${RUNNING_JOBS} != *"${NAME}"* && ${PENDING_JOBS} != *"${NAME}"* ]]; then
            # submit.sh is resolved relative to the current working directory.
            sh submit.sh || echo "submit.sh exited with status $?" >&2
            sleep 1
        fi
    done
    echo "============================"
    sleep "${CHECK_EVERY}"
done