From 3aa6591b32394807f84aade01b1d96bdc01d4c8b Mon Sep 17 00:00:00 2001 From: zhengya01 Date: Wed, 21 Sep 2022 15:40:41 +0800 Subject: [PATCH 1/2] update tipc log --- test_tipc/common_func.sh | 6 ++- test_tipc/test_train_inference_python.sh | 47 +++++++++++++++--------- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/test_tipc/common_func.sh b/test_tipc/common_func.sh index afd33b406..ed36080b3 100755 --- a/test_tipc/common_func.sh +++ b/test_tipc/common_func.sh @@ -65,9 +65,11 @@ function status_check(){ last_status=$1 # the exit code run_command=$2 run_log=$3 + model_name=$4 + log_path=$5 if [ $last_status -eq 0 ]; then - echo -e "\033[33m Run successfully with command - ${run_command}! \033[0m" | tee -a ${run_log} + echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} else - echo -e "\033[33m Run failed with command - ${run_command}! \033[0m" | tee -a ${run_log} + echo -e "\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} fi } diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index 8f431749d..2f0126e85 100755 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -136,7 +136,8 @@ if [ ${MODE} = "klquant_whole_infer" ]; then infer_value1=$(func_parser_value "${lines[17]}") fi -LOG_PATH="./test_tipc/output" +WORK_PATH=$(pwd) +LOG_PATH="$(pwd)/test_tipc/output" mkdir -p ${LOG_PATH} status_log="${LOG_PATH}/results_python.log" @@ -149,6 +150,7 @@ function func_inference(){ _log_path=$4 _img_dir=$5 _flag_quant=$6 + _gpu=$7 # inference for use_gpu in ${use_gpu_list[*]}; do if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then @@ -167,7 +169,7 @@ function func_inference(){ fi # skip when quant model inference but precision is not int8 set_precision=$(func_set_params "${precision_key}" "${precision}") - 
_save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + _save_log_path="${_log_path}/python_infer_cpu_gpus_${_gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") @@ -178,7 +180,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done done @@ -196,7 +198,7 @@ function func_inference(){ continue fi for batch_size in ${batch_size_list[*]}; do - _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + _save_log_path="${_log_path}/python_infer_gpu_gpus_${_gpu}_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") @@ -208,7 +210,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done @@ -391,8 +393,8 @@ else set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}") if [ ${#ips} -le 26 ];then - save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" nodes=1 + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" 
else IFS="," ips_array=(${ips}) @@ -408,10 +410,11 @@ else set_save_model=$(func_set_params "${save_model_key}" "${save_log}") if [ ${#gpu} -le 2 ];then # train with cpu or single gpu - cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " + _train_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}.log" + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} >${_train_log} 2>&1" eval "unset CUDA_VISIBLE_DEVICES" eval $cmd - status_check $? "${cmd}" "${status_log}" + status_check $? "${cmd}" "${status_log}" "${model_name}" "${_train_log}" elif [ ${#ips} -le 26 ];then # train with multi-gpu # run pserver @@ -421,10 +424,11 @@ else cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]} echo "PADDLE WILL START PSERVER "$cur_port export PADDLE_PORT=${cur_port} - cmd="${python} ${SC}" + _train_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_pserver_${cur_port}.log" + cmd="${python} ${SC} >${_train_log} 2>&1" eval "unset CUDA_VISIBLE_DEVICES" eval $cmd - status_check $? "${cmd}" "${status_log}" + status_check $? "${cmd}" "${status_log}" "${model_name}" "${_train_log}" done # run trainer @@ -433,16 +437,20 @@ else do echo "PADDLE WILL START Trainer "$i export PADDLE_TRAINER_ID=$i - cmd="${python} ${SC}" + _train_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_trainer_${i}.log" + cmd="${python} ${SC} >${_train_log} 2>&1" eval "unset CUDA_VISIBLE_DEVICES" eval $cmd - status_check $? 
"${cmd}" "${status_log}" "${model_name}" "${_train_log}" done else # train with multi-machine + _train_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}.log" cmd="${python} -m paddle.distributed.launch --ips=${ips} --devices=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" eval "unset CUDA_VISIBLE_DEVICES" eval $cmd - status_check $? "${cmd}" "${status_log}" + last_status=${PIPESTATUS[0]} + cat ${WORK_PATH}/log/workerlog.0 + status_check ${last_status} "${cmd}" "${status_log}" "${model_name}" "${_train_log}" fi # run train @@ -454,20 +462,23 @@ else fi # run eval if [ ${eval_py} != "null" ]; then + _eval_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log" set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") - eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" + eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1} >${_eval_log} 2>&1" eval $eval_cmd - status_check $? "${eval_cmd}" "${status_log}" + status_check $? "${eval_cmd}" "${status_log}" "${model_name}" "${_eval_log}" fi # run export model if [ ${run_export} != "null" ]; then # run export model + _export_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log" + set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") save_infer_path="${save_log}" set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}") set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}") - export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key}" + export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key} >${_export_log} 2>&1" eval $export_cmd - status_check $? "${export_cmd}" "${status_log}" + status_check $? 
"${export_cmd}" "${status_log}" "${model_name}" "${_export_log}" #run inference eval $env @@ -477,7 +488,7 @@ else else infer_model_dir=${save_infer_path} fi - func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" + func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" "${gpu}" eval "unset CUDA_VISIBLE_DEVICES" fi From b704ebb200d71e2a77638e38add59332881384c7 Mon Sep 17 00:00:00 2001 From: zhengya01 Date: Wed, 21 Sep 2022 16:28:17 +0800 Subject: [PATCH 2/2] update tipc log --- test_tipc/test_train_inference_python.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index 2f0126e85..eb5bb62be 100755 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -137,7 +137,7 @@ if [ ${MODE} = "klquant_whole_infer" ]; then fi WORK_PATH=$(pwd) -LOG_PATH="$(pwd)/test_tipc/output" +LOG_PATH="$(pwd)/test_tipc/output/${model_name}/${MODE}" mkdir -p ${LOG_PATH} status_log="${LOG_PATH}/results_python.log"