#!/bin/bash
#
# Evaluation driver: mirrors the code tree to local disk, prepares the
# environment, then runs a series of torchrun evaluation jobs followed by the
# scoring scripts under evaluation/ and third_party/seed-tts-eval.
#
# Usage:
#   <this script> [SEQ_LENGTH] [timestamp]
#     SEQ_LENGTH  defaults to 32768 (not referenced below; it stays a plain
#                 shell variable, so only sourced scripts could read it)
#     timestamp   defaults to the current time as YYYYmmdd_HHMMSS; it names
#                 the output directory
#
# Required variables (asserted after sourcing scripts/set_env_ds_gpu.sh):
#   NPROC_PER_NODE NNODES NODE_RANK MASTER_ADDR MASTER_PORT
# Optional: INDEX (suffix of the per-node log file name).

set -e
set -x

SEQ_LENGTH="${1:-32768}"

timestamp="${2:-}"
if [ -z "$timestamp" ]; then
  timestamp=$(date +'%Y%m%d_%H%M%S')
fi

######################################################################
# Helpers. They read the globals that are assigned further down
# (DISTRIBUTED_ARGS, MODEL_NAME_OR_PATH, AUDIO_TOKENIZER_PATH,
# AUDIO_TOKENIZER_TYPE, FLOW_PATH, OUTPUT_DIR, ...) at call time.

#######################################
# Launch one evaluation script under torchrun with the shared model flags.
# Arguments:
#   $1   - evaluation script, relative to the current directory
#   $2   - name of the input flag (--json_path or --data_path)
#   $3   - input path
#   $4   - output directory
#   $5.. - extra flags appended verbatim
#######################################
run_eval() {
  local script=$1 input_flag=$2 input_path=$3 out_dir=$4
  shift 4
  torchrun "${DISTRIBUTED_ARGS[@]}" "$script" \
    "$input_flag" "$input_path" \
    --model_name_or_path "$MODEL_NAME_OR_PATH" \
    --audio_tokenizer_path "$AUDIO_TOKENIZER_PATH" \
    --audio_tokenizer_type "$AUDIO_TOKENIZER_TYPE" \
    --flow_path "$FLOW_PATH" \
    --output_dir "$out_dir" \
    "$@"
}

#######################################
# Run evaluation/evaluate_sqa.py on one jsonl file, then score the
# <stem>_hyp_ref_text.json and <stem>_hyp_ref_speech.json results with
# evaluation/compute-acc-of-contain.py.
# Arguments:
#   $1 - path to the .jsonl file
#   $2 - sub-directory name under OUTPUT_DIR
#######################################
eval_sqa() {
  local json_path=$1 out_name=$2
  local stem=${json_path##*/}
  stem=${stem%.jsonl}
  # OUTPUT_DIR already ends in '/'; the extra '/' is kept so the resulting
  # paths are identical to the ones this script has always produced.
  local out_dir="${OUTPUT_DIR}/${out_name}/"

  run_eval evaluation/evaluate_sqa.py --json_path "$json_path" "$out_dir"

  local kind
  for kind in text speech; do
    python evaluation/compute-acc-of-contain.py "${out_dir}${stem}_hyp_ref_${kind}.json"
    echo "copypaste ACC: ${json_path}"
  done
}

#######################################
# Run a transcript-producing evaluation script on one jsonl file, then score
# <stem>_ref.txt against <stem>_hyp.txt with evaluation/compute-<metric>.py.
# Arguments:
#   $1   - evaluation script (evaluate_asr.py or evaluate_libritts.py)
#   $2   - path to the .jsonl file
#   $3   - sub-directory name under OUTPUT_DIR
#   $4.. - metrics to compute, lower case, in order (cer and/or wer)
#######################################
eval_transcripts() {
  local eval_script=$1 json_path=$2 out_name=$3
  shift 3
  local stem=${json_path##*/}
  stem=${stem%.jsonl}
  local out_dir="${OUTPUT_DIR}/${out_name}/"

  run_eval "$eval_script" --json_path "$json_path" "$out_dir"

  local metric
  for metric in "$@"; do
    python "evaluation/compute-${metric}.py" --char=1 --v=1 \
      "${out_dir}${stem}_ref.txt" "${out_dir}${stem}_hyp.txt"
    echo "copypaste ${metric^^}: ${json_path}"
  done
}

#######################################
# Run evaluation/evaluate_seedtts.py, then the scoring scripts shipped in
# third_party/seed-tts-eval (cal_wer.sh, optionally cal_sim.sh).
# Arguments:
#   $1   - 1 to also run cal_sim.sh after the WER runs, 0 for WER only
#   $2.. - extra flags for evaluate_seedtts.py (e.g. --speaker_prompt)
#######################################
eval_seedtts() {
  local with_sim=$1
  shift
  local data_path="${ROOT_PATH}/data/BytedanceSpeech/seed-tts-eval/"
  local testset="${data_path}/seedtts_testset"
  local out_root="${OUTPUT_DIR}/seed-tts"

  run_eval evaluation/evaluate_seedtts.py --data_path "$data_path" "${out_root}/" "$@"

  export ARNOLD_WORKER_GPU="${NPROC_PER_NODE}"
  cd "${LOCAL_CODE_PATH}/third_party/seed-tts-eval"

  bash cal_wer.sh "${testset}/zh/meta.lst" "${out_root}/zh/" zh
  echo "copypaste WER: ${data_path} zh"
  bash cal_wer.sh "${testset}/zh/hardcase.lst" "${out_root}/hardcase/" zh
  echo "copypaste WER: ${data_path} hardcase"
  bash cal_wer.sh "${testset}/en/meta.lst" "${out_root}/en/" en
  echo "copypaste WER: ${data_path} en"

  if [ "$with_sim" = "1" ]; then
    local sim_ckpt="${data_path}/wavlm_large_finetune.pth"
    bash cal_sim.sh "${testset}/zh/meta.lst" "${out_root}/zh/" "$sim_ckpt"
    echo "copypaste SIM: ${data_path} zh"
    bash cal_sim.sh "${testset}/zh/hardcase.lst" "${out_root}/hardcase/" "$sim_ckpt"
    echo "copypaste SIM: ${data_path} hardcase"
    bash cal_sim.sh "${testset}/en/meta.lst" "${out_root}/en/" "$sim_ckpt"
    echo "copypaste SIM: ${data_path} en"
  fi

  cd "${LOCAL_CODE_PATH}"
}

######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/

export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p "${LOCAL_ROOT_PATH}"
mkdir -p "${LOCAL_CODE_PATH}"

# apt-get rather than apt: apt's command line is not a stable script interface.
apt-get install -y rsync
rsync -a --exclude ".git" --exclude ".gitee" "${CODE_PATH}/" "${LOCAL_CODE_PATH}/"

# Everything below runs from the local mirror; relative paths such as
# evaluation/... and $0 are resolved against it.
cd "${LOCAL_CODE_PATH}"
rm -fr datasets
ln -s "${ROOT_PATH}/data" datasets

######################################################################
# Not referenced below; kept because the sourced env script shares this
# shell's variables and may read it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

source "${CODE_PATH}/scripts/set_env_ds_gpu.sh"
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3

# Fail early with a clear message instead of a torchrun argument-parsing error.
# NOTE(review): presumably provided by set_env_ds_gpu.sh or the job launcher.
: "${NPROC_PER_NODE:?NPROC_PER_NODE must be set}"
: "${NNODES:?NNODES must be set}"
: "${NODE_RANK:?NODE_RANK must be set}"
: "${MASTER_ADDR:?MASTER_ADDR must be set}"
: "${MASTER_PORT:?MASTER_PORT must be set}"

######################################################################
# The output directory embeds the invocation path ($0) and the timestamp.
OUTPUT_DIR="${ROOT_PATH}/output/LM/$0/${timestamp}/"

mkdir -p "${OUTPUT_DIR}"
# Keep a copy of this script next to its results.
rsync -avh "$0" "${OUTPUT_DIR}"

export HF_HOME="${ROOT_PATH}/data/HF_HOME/"
mkdir -p "${HF_HOME}"
export HF_ENDPOINT=https://hf-mirror.com

export MODELSCOPE_CACHE="${ROOT_PATH}/data/MODELSCOPE_CACHE/"
mkdir -p "${MODELSCOPE_CACHE}"

export LC_ALL="en_US.utf8"

######################################################################
# From here on, stdout and stderr are also appended to the per-node log.
# INDEX is not set in this script; when it is unset the file is log_node.txt.
LOG="${OUTPUT_DIR}/log_node${INDEX}.txt"
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"

######################################################################
# Model and audio tokenizer selection.
if true; then
#if false; then
  # Alternative checkpoint (was assigned and immediately overwritten):
  #MODEL_NAME_OR_PATH="/data/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh/VITA-Audio-Boost/"
  MODEL_NAME_OR_PATH="/data/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh/VITA-Audio-Balance/"

  AUDIO_TOKENIZER_PATH="${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer"
  FLOW_PATH="${ROOT_PATH}/models/THUDM/glm-4-voice-decoder"
  AUDIO_TOKENIZER_TYPE="glm4voice"

  export PYTHONPATH=${PYTHONPATH}:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/cosyvoice/:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
fi

######################################################################
# An array keeps each torchrun launcher argument a separate word without
# relying on unquoted word splitting.
DISTRIBUTED_ARGS=(
  --nproc_per_node "$NPROC_PER_NODE"
  --nnodes "$NNODES"
  --node_rank "$NODE_RANK"
  --master_addr "$MASTER_ADDR"
  --master_port "$MASTER_PORT"
)

######################################################################
# evaluate_sqa.py on the fixie-ai question sets.
if true; then
#if false; then
  # Under `set -e` a failing `apt-get update` on the left of && does not abort
  # the script; the ffmpeg install is then skipped (original behaviour kept).
  apt-get update && apt-get install -y ffmpeg

  eval_sqa "${ROOT_PATH}/data/jsonl/fixie-ai/llama-questions/test.jsonl" llama-questions
  eval_sqa "${ROOT_PATH}/data/jsonl/fixie-ai/trivia_qa-audio/validation.jsonl" trivia_qa-audio
  eval_sqa "${ROOT_PATH}/data/jsonl/fixie-ai/spoken-web-questions/test.jsonl" spoken-web-questions
fi

######################################################################
# evaluate_asr.py on the librispeech_asr splits (WER only).
if true; then
#if false; then
  for split in validation.clean validation.other test.clean test.other; do
    eval_transcripts evaluation/evaluate_asr.py \
      "${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/${split}.jsonl" \
      librispeech_asr wer
  done
fi

######################################################################
# evaluate_asr.py on wenetspeech (CER, then WER).
if true; then
#if false; then
  for split in TEST_MEETING TEST_NET; do
    eval_transcripts evaluation/evaluate_asr.py \
      "${ROOT_PATH}/data/jsonl/wenet-e2e/wenetspeech/${split}.jsonl" \
      wenetspeech cer wer
  done
fi

######################################################################
# evaluate_asr.py on AISHELL-1 (WER only).
if true; then
#if false; then
  eval_transcripts evaluation/evaluate_asr.py \
    "${ROOT_PATH}/data/jsonl/shenyunhang/AISHELL-1/test.jsonl" \
    AISHELL-1 wer
fi

######################################################################
# evaluate_libritts.py on libritts test.clean (WER only).
if true; then
#if false; then
  eval_transcripts evaluation/evaluate_libritts.py \
    "${ROOT_PATH}/data/jsonl/mythicinfinity/libritts/test.clean.jsonl" \
    libritts wer
fi

######################################################################
# seed-tts-eval with --speaker_prompt: WER and SIM.
if true; then
#if false; then
  eval_seedtts 1 --speaker_prompt
fi

######################################################################
# seed-tts-eval without --speaker_prompt: WER only (disabled).
if false; then
  eval_seedtts 0
fi

set +x