GPU scripts #302

Merged 4 commits on Oct 16, 2024
@@ -135,7 +135,6 @@ echo "#!/bin/bash
. /etc/profile

module load anaconda/py3.10
module load cuda/12.3
source activate $env_name

# Input args to the default classification_experiments are in main method of
168 changes: 168 additions & 0 deletions _tsml_research_resources/soton/iridis/gpu_clustering_experiments.sh
@@ -0,0 +1,168 @@
#!/bin/bash
# CHECK before each new run:
# datasets (list of problems)
# results_dir (where to check/write results)
# clusterers_to_run (list of clusterers to run)
# While reading is fine, please don't write anything to the default directories in this script

# Start and end for resamples
max_folds=5
start_fold=1

# To avoid dumping 1000s of jobs in the queue, we throttle submissions at a higher level
max_num_submitted=12

# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx
queue="gpu"

# The partition name may not always be the same as the queue name, i.e. batch is the queue, serial is the partition
# This is used for the script job limit queue
queue_alias=$queue

# Enter your username and email here
username="ajb2u23"
mail="NONE"
mailto="[email protected]"

# MB for jobs, increase incrementally and try not to use more than you need. If you need hundreds of GB consider the huge memory queue
max_memory=8000

# Max allowable is 60 hours
max_time="60:00:00"

# Start point for the script, e.g. with 3 datasets and 3 clusterers there are 9 jobs to submit; start_point=5 will skip to job 5
start_point=1

# Put your home directory here
local_path="/mainfs/home/$username/"

# Datasets to use and directory of data files. Default is Tony's workspace; all users should be able to read these. Change if you want to use different data or lists
data_dir="$local_path/Data/"
datasets="$local_path/DataSetLists/Clustering.txt"

# Results and output file write location. Change these to reflect your own file structure
results_dir="$local_path/ClusteringResults/results/"
out_dir="$local_path/ClusteringResults/output/"

# The python script we are running
script_file_path="$local_path/tsml-eval/tsml_eval/experiments/clustering_experiments.py"

# Environment name, change accordingly, for set up, see https://github.com/time-series-machine-learning/tsml-eval/blob/main/_tsml_research_resources/soton/iridis/iridis_python.md
# Separate environments for GPU and CPU are recommended
env_name="tsml-eval-gpu"

# Clusterers to loop over. Must be separated by a space
# See list of potential clusterers in set_clusterer
clusterers_to_run="kmedoids-squared kmedoids-euclidean"

# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args
# You will have to add any variable to the python call close to the bottom of the script
# and possibly to the options handling below

# Generate a results file for the test data as well as train; this is usually slower
generate_test_files="true"

# If set to true, looks for <problem><fold>_TRAIN.ts files. This is useful for running tsml-java resamples
predefined_folds="false"

# Boolean for whether to combine the test/train split
combine_test_train_split="false"

# Normalise data before fit/predict
normalise_data="true"

# ======================================================================================
# ======================================================================================
# Don't change anything below this point (unless you want to change how the
# experiment works)
# ======================================================================================
# ======================================================================================

# Set to -te to generate test files
generate_test_files=$([ "${generate_test_files,,}" == "true" ] && echo "-te" || echo "")

# Set to -pr to use predefined folds
predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "")

# Update result path to split combined test train split and test train split
results_dir="${results_dir}$([ "${combine_test_train_split,,}" == "true" ] && echo "combine-test-train-split/" || echo "test-train-split/")"

# Update out path to split combined test train split and test train split
out_dir="${out_dir}$([ "${combine_test_train_split,,}" == "true" ] && echo "combine-test-train-split/" || echo "test-train-split/")"

# Set to -ctts to combine test train split
combine_test_train_split=$([ "${combine_test_train_split,,}" == "true" ] && echo "-ctts" || echo "")

# Set to -rn to normalise data
normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "")

count=0
while read dataset; do
for clusterer in $clusterers_to_run; do

# Skip to the script start point
((count++))
if ((count>=start_point)); then

# This loop keeps the number of queued jobs at around max_num_submitted, rather than submitting everything at once
num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l)
while [ "${num_jobs}" -ge "${max_num_submitted}" ]
do
echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted}
sleep 60
num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l)
done

mkdir -p "${out_dir}${clusterer}/${dataset}/"

# This skips jobs which have test/train files already written to the results directory. Only looks for Resamples, not Folds (old file name)
array_jobs=""
for (( i=start_fold-1; i<max_folds; i++ ))
do
if [ -f "${results_dir}${clusterer}/Predictions/${dataset}/trainResample${i}.csv" ]; then
if [ "${generate_test_files}" == "-te" ] && ! [ -f "${results_dir}${clusterer}/Predictions/${dataset}/testResample${i}.csv" ]; then
array_jobs="${array_jobs}${array_jobs:+,}$((i + 1))"
fi
else
array_jobs="${array_jobs}${array_jobs:+,}$((i + 1))"
fi
done

if [ "${array_jobs}" != "" ]; then

# This creates the script to run the job based on the info above
echo "#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --mail-type=${mail}
#SBATCH --mail-user=${mailto}
#SBATCH -p ${queue}
#SBATCH -t ${max_time}
#SBATCH --job-name=${clusterer}${dataset}
#SBATCH --array=${array_jobs}
#SBATCH --mem=${max_memory}M
#SBATCH -o ${out_dir}${clusterer}/${dataset}/%A-%a.out
#SBATCH -e ${out_dir}${clusterer}/${dataset}/%A-%a.err
#SBATCH --nodes=1

. /etc/profile

module load anaconda/py3.10
source activate $env_name

# Input args to the default clustering_experiments are in main method of
# https://github.com/time-series-machine-learning/tsml-eval/blob/main/tsml_eval/experiments/clustering_experiments.py
python -u ${script_file_path} ${data_dir} ${results_dir} ${clusterer} ${dataset} \$((\$SLURM_ARRAY_TASK_ID - 1)) ${generate_test_files} ${predefined_folds} ${combine_test_train_split} ${normalise_data}" > generatedFile.sub

echo "${count} ${clusterer}/${dataset}"

sbatch < generatedFile.sub

else
echo "${count} ${clusterer}/${dataset} has finished all required resamples, skipping"
fi

fi
done
done < ${datasets}

echo Finished submitting jobs
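The scripts above lean on two bash idioms worth calling out: the `${var,,}` expansion that lowercases a variable so a case-insensitive "true" maps to a CLI flag, and the `${list:+,}` expansion that comma-joins array indices without a leading comma. A standalone sketch of both (the `to_flag` helper is illustrative, not part of the scripts):

```shell
#!/bin/bash
# Map a case-insensitive boolean string to a CLI flag, as the option
# handling above does: "True", "TRUE" and "true" all produce the flag.
to_flag() {
  local value="$1" flag="$2"
  if [ "${value,,}" == "true" ]; then
    printf '%s' "$flag"
  fi
}

# Build a comma-separated SLURM array spec, as the array_jobs loop does.
# ${list:+,} expands to "," only when list is already non-empty.
list=""
for i in 1 3 5; do
  list="${list}${list:+,}${i}"
done

echo "$(to_flag True -rn)"  # -rn
echo "$(to_flag false -rn)" # (empty)
echo "$list"                # 1,3,5
```

Note that `${var,,}` requires bash 4 or later, which is why the scripts start with `#!/bin/bash` rather than `#!/bin/sh`.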
156 changes: 156 additions & 0 deletions _tsml_research_resources/soton/iridis/gpu_regression_experiments.sh
@@ -0,0 +1,156 @@
#!/bin/bash
# CHECK before each new run:
# datasets (list of problems)
# results_dir (where to check/write results)
# regressors_to_run (list of regressors to run)
# While reading is fine, please don't write anything to the default directories in this script

# Start and end for resamples
max_folds=30
start_fold=1

# To avoid dumping 1000s of jobs in the queue, we throttle submissions at a higher level
max_num_submitted=100

# Queue options are https://sotonac.sharepoint.com/teams/HPCCommunityWiki/SitePages/Iridis%205%20Job-submission-and-Limits-Quotas.aspx
queue="gpu"

# The partition name may not always be the same as the queue name, i.e. batch is the queue, serial is the partition
# This is used for the script job limit queue
queue_alias=$queue

# Enter your username and email here
username="ajb2u23"
mail="NONE"
mailto="[email protected]"

# MB for jobs, increase incrementally and try not to use more than you need. If you need hundreds of GB consider the huge memory queue
max_memory=8000

# Max allowable is 60 hours
max_time="60:00:00"

# Start point for the script, e.g. with 3 datasets and 3 regressors there are 9 jobs to submit; start_point=5 will skip to job 5
start_point=1

# Put your home directory here
local_path="/mainfs/home/$username/"

# Datasets to use and directory of data files. Default is Tony's workspace; all users should be able to read these. Change if you want to use different data or lists
data_dir="$local_path/Data/"
datasets="$local_path/DataSetLists/Regression.txt"

# Results and output file write location. Change these to reflect your own file structure
results_dir="$local_path/RegressionResults/results/"
out_dir="$local_path/RegressionResults/output/"

# The python script we are running
script_file_path="$local_path/tsml-eval/tsml_eval/experiments/regression_experiments.py"

# Environment name, change accordingly, for set up, see https://github.com/time-series-machine-learning/tsml-eval/blob/main/_tsml_research_resources/soton/iridis/iridis_python.md
# Separate environments for GPU and CPU are recommended
env_name="tsml-eval-gpu"

# Regressors to loop over. Must be separated by a space
# See list of potential regressors in set_regressor
regressors_to_run="RocketRegressor TimeSeriesForestRegressor"

# You can add extra arguments here. See tsml_eval/utils/arguments.py parse_args
# You will have to add any variable to the python call close to the bottom of the script
# and possibly to the options handling below

# Generate a results file for the train data as well as test; this is usually slower
generate_train_files="false"

# If set to true, looks for <problem><fold>_TRAIN.ts files. This is useful for running tsml-java resamples
predefined_folds="false"

# Normalise data before fit/predict
normalise_data="false"

# ======================================================================================
# ======================================================================================
# Don't change anything below this point (unless you want to change how the
# experiment works)
# ======================================================================================
# ======================================================================================

# Set to -tr to generate train files
generate_train_files=$([ "${generate_train_files,,}" == "true" ] && echo "-tr" || echo "")

# Set to -pr to use predefined folds
predefined_folds=$([ "${predefined_folds,,}" == "true" ] && echo "-pr" || echo "")

# Set to -rn to normalise data
normalise_data=$([ "${normalise_data,,}" == "true" ] && echo "-rn" || echo "")

count=0
while read dataset; do
for regressor in $regressors_to_run; do

# Skip to the script start point
((count++))
if ((count>=start_point)); then

# This loop keeps the number of queued jobs at around max_num_submitted, rather than submitting everything at once
num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l)
while [ "${num_jobs}" -ge "${max_num_submitted}" ]
do
echo Waiting 60s, ${num_jobs} currently submitted on ${queue}, user-defined max is ${max_num_submitted}
sleep 60
num_jobs=$(squeue -u ${username} --format="%20P %5t" -r | awk '{print $2, $1}' | grep -e "R ${queue_alias}" -e "PD ${queue_alias}" | wc -l)
done

mkdir -p "${out_dir}${regressor}/${dataset}/"

# This skips jobs which have test/train files already written to the results directory. Only looks for Resamples, not Folds (old file name)
array_jobs=""
for (( i=start_fold-1; i<max_folds; i++ ))
do
if [ -f "${results_dir}${regressor}/Predictions/${dataset}/testResample${i}.csv" ]; then
if [ "${generate_train_files}" == "-tr" ] && ! [ -f "${results_dir}${regressor}/Predictions/${dataset}/trainResample${i}.csv" ]; then
array_jobs="${array_jobs}${array_jobs:+,}$((i + 1))"
fi
else
array_jobs="${array_jobs}${array_jobs:+,}$((i + 1))"
fi
done

if [ "${array_jobs}" != "" ]; then

# This creates the script to run the job based on the info above
echo "#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --mail-type=${mail}
#SBATCH --mail-user=${mailto}
#SBATCH -p ${queue}
#SBATCH -t ${max_time}
#SBATCH --job-name=${regressor}${dataset}
#SBATCH --array=${array_jobs}
#SBATCH --mem=${max_memory}M
#SBATCH -o ${out_dir}${regressor}/${dataset}/%A-%a.out
#SBATCH -e ${out_dir}${regressor}/${dataset}/%A-%a.err
#SBATCH --nodes=1

. /etc/profile

module load anaconda/py3.10
source activate $env_name

# Input args to the default regression_experiments are in main method of
# https://github.com/time-series-machine-learning/tsml-eval/blob/main/tsml_eval/experiments/regression_experiments.py
python -u ${script_file_path} ${data_dir} ${results_dir} ${regressor} ${dataset} \$((\$SLURM_ARRAY_TASK_ID - 1)) ${generate_train_files} ${predefined_folds} ${normalise_data}" > generatedFile.sub

echo "${count} ${regressor}/${dataset}"

sbatch < generatedFile.sub

else
echo "${count} ${regressor}/${dataset} has finished all required resamples, skipping"
fi

fi
done
done < ${datasets}

echo Finished submitting jobs
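A detail that is easy to miss in both scripts: inside the quoted `echo` that writes `generatedFile.sub`, variables such as `${clusterer}` expand immediately, while `\$SLURM_ARRAY_TASK_ID` is escaped so the literal `$` survives into the generated file and the arithmetic only runs when SLURM executes the job. A minimal sketch of the mechanism (`demo.sub` is an illustrative file name):

```shell
#!/bin/bash
# \$ writes a literal $ into the generated file, so the arithmetic is
# evaluated when the generated job script runs, not at generation time.
echo "echo fold=\$((\$SLURM_ARRAY_TASK_ID - 1))" > demo.sub

cat demo.sub
# -> echo fold=$(($SLURM_ARRAY_TASK_ID - 1))

# Simulate SLURM exporting the array index when the job starts:
SLURM_ARRAY_TASK_ID=3 bash demo.sub
# -> fold=2
```

This is also why the scripts pass `\$((\$SLURM_ARRAY_TASK_ID - 1))` as the resample argument: `--array` indices are 1-based in the submission, while the experiment scripts expect 0-based resample IDs.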
22 changes: 14 additions & 8 deletions _tsml_research_resources/soton/iridis/iridis_python.md
@@ -138,13 +138,9 @@ If a dependency install is "Killed", it is likely the interactive session has run out of memory.

#### 5.1. tsml-eval GPU

For GPU jobs we require an additional Iridis module in CUDA:
It is recommended to use a different environment for GPU jobs. Move to the package directory and install the required packages for GPU jobs:

>module add cuda/12.3

A specific Tensorflow version is required to match the available CUDA install.

>pip install --editable . tensorflow==2.17.0
>pip install --editable . tensorflow[and-cuda] tensorrt
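After installing, a quick way to confirm that TensorFlow can actually see a GPU before submitting jobs (a sanity check, not part of the official setup steps):

>python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"

This should list at least one `GPU` device; an empty list means the CUDA libraries were not picked up.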

# Running experiments

@@ -169,11 +165,21 @@ The default queue for CPU jobs is _batch_. Be sure to swap the _queue_alias_ to
Do not run threaded code on the cluster without reserving whole nodes, as there is nothing to stop the job from using
the CPU resources allocated to others. The default python file in the scripts attempts to avoid threading as much as possible. You should ensure processes are not intentionally using multiple threads if you change it.

Requesting memory for a job will allocate it all on the jobs assigned node. New jobs will not be submitted to a node if the total allocated memory exceeds the amount available for the node. As such, requesting too much memory can block new jobs from using the node. This is ok if the memory is actually being used, but large amounts of memory should not be requested unless you know it will be required for the jobs you are submitting. ADA is a shared resource, and instantly requesting hundreds of GB will hurt the overall efficiency of the cluster.
Requesting memory for a job will allocate all of it on the job's assigned node. New jobs will not be submitted to a node if the total allocated memory would exceed the amount available on the node. As such, requesting too much memory can block new jobs from using the node. This is fine if the memory is actually being used, but large amounts of memory should not be requested unless you know your jobs will require it. Iridis is a shared resource, and instantly requesting hundreds of GB will hurt the overall efficiency of the cluster.

## Running `tsml-eval` GPU experiments

todo
For GPU experiments use one of the following scripts:

>gpu_classification_experiments.sh

>gpu_regression_experiments.sh

>gpu_clustering_experiments.sh

It is recommended you use different environments for CPU and GPU jobs.

The default queue for GPU jobs is _gpu_.
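For reference, the GPU scripts differ from the CPU versions mainly in the sbatch headers they generate: they target the _gpu_ partition and request a single device per array task. The relevant directives, as written by the scripts:

```shell
#SBATCH -p gpu        # GPU partition/queue
#SBATCH --gres=gpu:1  # request one GPU per array task
#SBATCH --nodes=1
```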

## Monitoring jobs on Iridis
