Skip to content

Commit

Permalink
Merge branch 'distributed-strategy-launcher' into python-venv
Browse files Browse the repository at this point in the history
  • Loading branch information
matbun authored May 3, 2024
2 parents f7d30a8 + 71f79cb commit aa28da4
Show file tree
Hide file tree
Showing 12 changed files with 871 additions and 14 deletions.
5 changes: 4 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,8 @@
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": [
"./src/itwinai"
]
}
11 changes: 1 addition & 10 deletions env-files/tensorflow/createEnvJSCTF.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,7 @@ echo "system:${sysN}"
echo

cont1=false
if [ "$sysN" = 'deepv' ] ; then
ml use "$OTHERSTAGES"
ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake
cont1=true
elif [ "$sysN" = 'juwels' ] ; then
ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio cuDNN
cont1=true
elif [ "$sysN" = 'hdfml' ] ; then
#ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake
#ml Stages/2023 NVHPC/23.1 ParaStationMPI/5.8.0-1-mt NCCL/default-CUDA-11.7 cuDNN/8.6.0.163-CUDA-11.7 Python CMake
if [ "$sysN" = 'hdfml' ] ; then
ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12
cont1=true
else
Expand Down
44 changes: 44 additions & 0 deletions tutorials/distributed-ml/tf-scaling-test-jube/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Benchmarking tutorial using JUBE

Benchmarking of itwinai can also be performed with the JUBE Benchmarking Environment from JSC.
The JUBE benchmarking tool is already set up in the environment files provided under `env-files`.

## Source the environment

Load the modules required by your system and activate the Python virtual environment, for example:

```bash
ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12
source envAItf_hdfml/bin/activate
```

## Run benchmark

The benchmarks are defined in the `general_jobsys.xml` file.
One can specify the configurations in terms of parameters such as the number of nodes.
The benchmark can be simply launched with the command:

```bash
jube run general_jobsys.xml
```

## Monitor status of benchmark run

The status of the run can be monitored with:

```bash
jube continue bench_run --id last
```

## Check results of the benchmark run

The results can be viewed with:

```bash
jube result -a bench_run --id last
```

This will create `result-csv.dat` file in the `results` folder.

The scaling and efficiency plots can be generated with the `bench_plot.ipynb` file
which takes the `result-csv.dat` file as input.
170 changes: 170 additions & 0 deletions tutorials/distributed-ml/tf-scaling-test-jube/bench_plot.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Plot benchmark results of itwinai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os, pandas as pd, matplotlib.pyplot as plt, numpy as np\n",
"%matplotlib inline\n",
"pd.options.display.max_columns = None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"plt.rcParams['figure.figsize'] = [12, 6]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df = pd.read_csv('result-csv.dat',header=0)\n",
"df.rename(columns=lambda x: x.split('[')[0], inplace=True)\n",
"\n",
"# gpus\n",
"df[\"NGPUs\"] = df[\"Nnodes\"]*4\n",
"\n",
"# speedup\n",
"df[\"Speedup - ideal\"] = df[\"Nnodes\"].astype(float)\n",
"df[\"Speedup\"] = df[\"Naet\"].iloc[0] / df[\"Naet\"]\n",
"\n",
"# efficiency\n",
"df[\"Threadscaled Sim. Time / s\"] = df[\"Naet\"] * df[\"Nnodes\"] * df[\"Nworkers\"]\n",
"df[\"Efficiency\"] = df[\"Threadscaled Sim. Time / s\"].iloc[0] / df[\"Threadscaled Sim. Time / s\"]\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Overview"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ax = df.pivot_table(index=[\"NGPUs\"], columns=[\"Nworkers\"], values=\"Naet\").plot(kind=\"bar\", title=\"Runtime behaviour\");\n",
"ax.set_ylabel(\"Epoch Time / s\");\n",
"ax_abs = ax\n",
"for p in ax.patches:\n",
" ax.annotate(\"{:.2f} s\".format(p.get_height()), (p.get_x() + p.get_width()/1.33, p.get_height() * 1.01), \\\n",
" color=\"dimgray\", horizontalalignment=\"center\", verticalalignment=\"bottom\", rotation=\"vertical\")\n",
"pass"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scaling Behaviour"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ax = df.pivot_table(index=[\"NGPUs\"], columns=[\"Nworkers\"], values=\"Speedup\").plot(style=\"*-\", \\\n",
" loglog=False, title=\"Scaling behaviour\", color=\"r\", legend=False);\n",
"ax.plot(df[\"NGPUs\"].values,df[\"Speedup - ideal\"].values,ls='dashed',lw=1.0,c='k',label=\"ideal\")\n",
"\n",
"ax.legend(ncol=1, title=\"(Nworkers)\")\n",
"ax.set_xticks(df[\"NGPUs\"].values)\n",
"ax.set_yticks(df[\"Speedup - ideal\"].values)\n",
"ax.set_ylabel(r'Speedup')\n",
"ax.set_xlim((0,np.amax(df[\"NGPUs\"].values+1)))\n",
"ax.set_ylim((0,np.amax(df[\"Speedup - ideal\"].values+1)))\n",
"\n",
"pass"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Runtime Efficiencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ax = df.pivot_table(index=[\"NGPUs\"], columns=[\"Nworkers\"], values=\"Efficiency\").plot(kind=\"bar\", \\\n",
" legend=False, title=\"Runtime efficiency\")\n",
"ax.legend(ncol=1, title=\"(Nworkers)\",loc=4)\n",
"ax.set_ylabel(\"Efficiency\");\n",
"for p, abs in zip(ax.patches, ax_abs.patches):\n",
" ax.annotate(\"{:.2f}\".format(p.get_height()), (p.get_x() + p.get_width()/1.33, p.get_height() * 1.01), \\\n",
" color=\"dimgray\", horizontalalignment=\"center\", verticalalignment=\"bottom\", rotation=\"vertical\")\n",
" ax.annotate(\"Abs: {:.1f} s\".format(abs.get_height()), (p.get_x() + p.get_width()/1.33, p.get_height() * 0.95), \\\n",
" color=\"white\", horizontalalignment=\"center\", verticalalignment=\"top\", rotation=\"vertical\")\n",
"ax.plot(df[\"NGPUs\"].values-8,df[\"Speedup - ideal\"].values*0+1,ls='dashed',lw=1.0,c='r',label=\"ideal\")\n",
"pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# EOF"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
140 changes: 140 additions & 0 deletions tutorials/distributed-ml/tf-scaling-test-jube/general_jobsys.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
<?xml version="1.0" encoding="UTF-8"?>
<jube>
  <benchmark name="bench" outpath="bench_run">
    <comment>General benchmark script</comment>

    <!-- bench configuration -->
    <parameterset name="paramset">
      <!-- iterated nodes: one JUBE workpackage per value -->
      <parameter name="iterNO" type="int">1,2,4,8</parameter>
      <!-- iterated #workers -->
      <parameter name="iterNW" type="int">8</parameter>
      <!-- name of the training script launched by the job file -->
      <parameter name="script">train.py</parameter>
    </parameterset>

    <!-- job configuration -->
    <parameterset name="executeset">
      <!-- detect the JSC system name (FZJ systems expose it in /etc/FZJ/systemname) -->
      <parameter name="systemname" mode="shell">if [ -f /etc/FZJ/systemname ]; then cat /etc/FZJ/systemname | tr -d "\n"; else uname -n | head -c 3; fi</parameter>
      <parameter name="submit_cmd">sbatch</parameter>
      <parameter name="nodes">$iterNO</parameter>
      <parameter name="nnw">$iterNW</parameter>
      <parameter name="ready_file">ready</parameter>
      <parameter name="job_file">jube_ddp.sh</parameter>
      <!-- GPUs per node, looked up by system name -->
      <parameter name="ngpu" mode="python" type="int">
        { "hdfml": 4,
        }["${systemname}"]
      </parameter>
      <parameter name="account">intertwin</parameter>
      <!-- main run -->
      <parameter name="timelimit" tag="!devel">04:00:00</parameter>
      <parameter name="queue" tag="!devel" mode="python">
        { "hdfml": "batch",
        }["${systemname}"]
      </parameter>
      <!-- devel run (short time limit for quick checks) -->
      <parameter name="timelimit" tag="devel">00:10:00</parameter>
      <parameter name="queue" tag="devel" mode="python">
        { "hdfml": "batch",
        }["${systemname}"]
      </parameter>
    </parameterset>

    <parameterset name="envirset">
      <!-- module load command per system (fixed duplicated "ml ml") -->
      <parameter name="load_modules" separator="!" mode="python"> {
        "hdfml": "ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12",
        }["${systemname}"]
      </parameter>
      <parameter name="python_env">source /p/project/intertwin/rakesh/repo_push/itwinai/envAItf_hdfml/bin/activate</parameter>
      <parameter name="devices" separator="!" mode="python">{
        "hdfml": "export CUDA_VISIBLE_DEVICES=0,1,2,3"
        }["${systemname}"]
      </parameter>
    </parameterset>

    <!-- load jobfile -->
    <fileset name="files">
      <copy>$job_file</copy>
      <link>$script</link>
    </fileset>

    <!-- substitute placeholders in the jobfile -->
    <substituteset name="sub_job">
      <iofile in="${job_file}" out="$job_file" />
      <sub source="#NODES#" dest="$nodes" />
      <sub source="#READY#" dest="$ready_file" />
      <sub source="#NW#" dest="$nnw" />
      <sub source="#SCRIPT#" dest="$script" />
      <sub source="#ACC#" dest="$account" />
      <sub source="#NGPU#" dest="$ngpu" />
      <sub source="#TIMELIM#" dest="$timelimit" />
      <sub source="#QUEUE#" dest="$queue" />
      <sub source="#MODULES#" dest="$load_modules" />
      <sub source="#ENVS#" dest="$python_env" />
      <sub source="#DEVICES#" dest="$devices" />
    </substituteset>

    <!-- operation/execution of bench -->
    <step name="submit" work_dir="JUBE/${jube_benchmark_id}_${jube_wp_id}" >
      <use>paramset</use>
      <use>executeset</use>
      <use>envirset</use>
      <use>files,sub_job</use>
      <do>echo "nID: $jube_wp_id"</do> <!-- shell command -->
      <!-- submit and wait until the job touches $ready_file -->
      <do done_file="$ready_file">$submit_cmd $job_file</do> <!-- shell command -->
    </step>

    <!-- results -->
    <!-- regex patterns extracted from the job output -->
    <patternset name="pattern">
      <pattern name="ID" type="int">${jube_wp_id}</pattern>
      <pattern name="Nnodes" type="int">${nodes}</pattern>
      <pattern name="Nworkers" type="int">${nnw}</pattern>
      <pattern name="calcTime" unit="s" type="float">\s*TIMER: total epoch time:\s+$jube_pat_wrd\s*</pattern>
      <pattern name="avgEpochT" unit="s" type="float">\s*TIMER: average epoch time:\s+$jube_pat_wrd\s*</pattern>
      <pattern name="Naet" unit="s" type="float" mode="python">${avgEpochT}</pattern>
    </patternset>

    <!-- analyse -->
    <analyzer name="analyse" >
      <use>pattern</use> <!-- use existing patternset -->
      <analyse step="submit">
        <file>stdout</file>
        <file>job.out</file>
      </analyse>
    </analyzer>

    <!-- create result table in CSV -->
    <!-- NOTE(review): a memoryGPU column was previously listed here, but no
         pattern named memoryGPU is defined in the patternset above, so the
         column could never be filled; removed the dangling reference. -->
    <result result_dir="results">
      <use>analyse</use>
      <table name="result-csv" style="csv" sort="jube_wp_id">
        <column>ID</column>
        <column>Nnodes</column>
        <column>Nworkers</column>
        <column format=".3f">calcTime</column>
        <column format=".3f">avgEpochT</column>
        <column format=".3f">Naet</column>
      </table>
    </result>

    <!-- create pretty-printed result table -->
    <result>
      <use>analyse</use>
      <table name="result" style="pretty" sort="jube_wp_id">
        <column>ID</column>
        <column>Nnodes</column>
        <column>Nworkers</column>
        <column format=".3f">calcTime</column>
        <column format=".3f">avgEpochT</column>
        <column format=".3f">Naet</column>
      </table>
    </result>

  </benchmark>
</jube>

<!-- eof -->

Loading

0 comments on commit aa28da4

Please sign in to comment.