diff --git a/ansible.cfg b/ansible.cfg index 9af49e2c..f5b7f9a7 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -1,4 +1,6 @@ [defaults] +host_key_checking = False +callback_whitelist = profile_tasks # include roles from galaxyproject/ansible-common-roles roles_path = roles:common_roles @@ -17,3 +19,6 @@ transport = ssh # These are necessary for cloud instances #pipelining = False #ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no + +[paramiko_connection] +record_host_keys = False diff --git a/jetstream_common/files/elasticity/cloudbridge b/jetstream_common/files/elasticity/cloudbridge new file mode 100644 index 00000000..1bb4aec6 --- /dev/null +++ b/jetstream_common/files/elasticity/cloudbridge @@ -0,0 +1,15 @@ +$ANSIBLE_VAULT;1.1;AES256 +30613566326532613235643863643061666535326538303961656161383562613538666335353338 +3732646138623733353131626236616662303661626266620a373630623662346430623734656131 +66363662643339383161663664396338633130383566323936653861363830393833653939623563 +3431316239313237610a303866396566656638613136626438373364326235346639643466323937 +66623562353539363864646639613036306335316138616332656263393564616463323839306531 +66636465626539366661343261633530633337653030633864663135323033626333316166366535 +65326137623864643265393465383139396538363835356232303162383935356139333738343230 +34363136343137646566653635376663343939316436613762313337353936656236366162383232 +39663239396633373066323638666135623964393361666539303032343539383335313936333739 +63313266346562646433303238303232386435643562343164346533323265313136653263393734 +66333636386336623265333263636132323331326663396436326230636236316136323265383236 +32646165386631353538333365383964646163623631333762333961343733393135396563313363 +63663334333163663735323064353166343930343639383930393061623966656665346636613665 +3332336361646631316431653635666536336538633936663365 diff --git a/jetstream_common/files/elasticity/elasticity_kp 
b/jetstream_common/files/elasticity/elasticity_kp new file mode 100644 index 00000000..1620a97e --- /dev/null +++ b/jetstream_common/files/elasticity/elasticity_kp @@ -0,0 +1,90 @@ +$ANSIBLE_VAULT;1.1;AES256 +64646363613565643537363537623663373961326364366636343061323666396235326430623538 +6263373731333466383561386139653035313362326237660a366639333261643337643332666164 +32326564623561663435666162343930396634656130313039363930353664666336323839356339 +3566623532316233630adiff --git a/jetstream_common/files/elasticity/elasticity_kp.pub b/jetstream_common/files/elasticity/elasticity_kp.pub new file mode 100644 index 00000000..92b4fb7d --- /dev/null +++ b/jetstream_common/files/elasticity/elasticity_kp.pub @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCpw4D9NR0at80ehc84kTchP2aDCV9HbZetptVCwc2/sdzUHvecUXNRc61RYKttF3kdgWO8UhGWjUUjNCMOR/gYxV1Ui1hpihFExdg2tHbXpkArdrHJK6n+QWf7qYrNTLFVFH2XVxouzJ4k37slGNeQMWczRHS2ZUL+cbfYOEWe8RhWRqhRjvAdxLnQt1dg/sKc0MqeDwEVAmdgNHUozbeUxKVVsoxWBeUiLK7xTd0PQ/jXRVWY1pYQ2xfjvhm1DiylQO/5fox8Z5MmyqBSAsKdGgfe47K335QmmrEKtq/6O1AAnm6D0Pkqky3EgzT7g/DpT2n8VyCWAdFIvrQAg65V Generated-by-Nova diff --git a/jetstream_common/files/elasticity/vp b/jetstream_common/files/elasticity/vp new file mode 100644 index 00000000..b82b0bc4 --- /dev/null +++ b/jetstream_common/files/elasticity/vp @@ -0,0 +1,7 @@ +$ANSIBLE_VAULT;1.1;AES256 +39343936396638626438353764366237336637336537383966393234303534393730303635653564 +3837353262653931633761336561623666653464303032630a323933393438376438663332666238 +34333133326533313136363935373837376538636230653239353430356330633037393766373763 +3031376537643338370a333939656563646539356563306666306131303638666666633036383161 +34636336623039376638633430353138363031366230633833633234616238613165303534336464 +3634326232303961313037333636646562393732666334653033 diff --git a/jetstream_common/files/slurm/launch b/jetstream_common/files/slurm/launch index 68f7f223..5c73d26e 100755 --- a/jetstream_common/files/slurm/launch +++ 
b/jetstream_common/files/slurm/launch @@ -9,7 +9,7 @@ LOG_DIR=/var/log/slurm/launch set -xv VENV=/opt/slurm_cloud_provision -PLAYBOOK=/home/centos/infrastructure-playbook +PLAYBOOK=/opt/slurm_cloud_provision/infrastructure-playbook : ${HOME:=/var/lib/slurm} export HOME @@ -17,5 +17,5 @@ export HOME . $VENV/bin/activate cd $PLAYBOOK instances=`scontrol show hostnames "$1" | tr '\n' ',' | sed 's/,$//'` -ansible-playbook -i jetstreamenv/instance_inventory jetstreamenv/launch.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_launch=$instances" +ansible-playbook -i jetstreamiuenv/instance_inventory jetstreamiuenv/launch.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_launch=$instances" ) 2>&1 | tee "$LOG_DIR/launch.`date +%s`.${1}.log" diff --git a/jetstream_common/files/slurm/terminate b/jetstream_common/files/slurm/terminate index 1bb954c1..4b78dc79 100755 --- a/jetstream_common/files/slurm/terminate +++ b/jetstream_common/files/slurm/terminate @@ -9,7 +9,7 @@ LOG_DIR=/var/log/slurm/launch set -xv VENV=/opt/slurm_cloud_provision -PLAYBOOK=/home/centos/infrastructure-playbook +PLAYBOOK=/opt/slurm_cloud_provision/infrastructure-playbook : ${HOME:=/var/lib/slurm} export HOME @@ -17,6 +17,6 @@ export HOME . 
$VENV/bin/activate cd $PLAYBOOK instances=`scontrol show hostnames "$1" | tr '\n' ',' | sed 's/,$//'` -ansible-playbook -i jetstreamenv/instance_inventory jetstreamenv/terminate.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_terminate=$instances" +ansible-playbook -i jetstreamiuenv/instance_inventory jetstreamiuenv/terminate.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_terminate=$instances" ) 2>&1 | tee "$LOG_DIR/terminate`date +%s`.${1}.log" true diff --git a/jetstream_common/launch.yml b/jetstream_common/launch.yml index ad990ec2..e128d4b4 100644 --- a/jetstream_common/launch.yml +++ b/jetstream_common/launch.yml @@ -8,14 +8,14 @@ tasks: - name: Launch new instance(s) os_server: - cloud: jetstream_iu + cloud: jetstream_iu # Details defined in clouds.yaml, including auth name: "{{ item }}" - image: "CentOS 7 Stock 1601" - flavor: "m1.large" - key_name: "slurm_jetstream0" - security_groups: "default,usegalaxy-control" + image: "{{ worker_image_id }}" + flavor: "{{ worker_instance_type }}" + key_name: "elasticity_kp" + security_groups: "gxy-workers-sg" nics: - - net-name: "usegalaxy" + - net-name: "gxy-slurm-net" auto_ip: no with_items: "{{ jetstream_instances_to_launch.split(',') }}" register: jetstream_instances_launched @@ -29,7 +29,8 @@ add_host: name: "{{ item.server.name }}" ansible_host: "{{ item.server.private_v4 }}" - groups: "baseenv,galaxynodes,slurmclients,slurmexechosts" + groups: + "baseenv,galaxynodes,slurmclients,slurmexechosts,jetstreamnfsclients" with_items: "{{ jetstream_instances_launched.results }}" - name: Spin waiting for instance(s) to become accessible diff --git a/jetstream_common/playbook.yml b/jetstream_common/playbook.yml index 5ee720e7..110a57c7 100644 --- a/jetstream_common/playbook.yml +++ b/jetstream_common/playbook.yml @@ -5,6 +5,7 @@ remote_user: centos become: yes become_method: sudo + connection: paramiko pre_tasks: - name: Locate secret group variable files 
local_action: @@ -65,7 +66,7 @@ dest: /etc/hosts - name: Install supervisor - hosts: all + hosts: controllers remote_user: centos become: yes become_method: sudo diff --git a/jetstream_common/secret_group_vars/clouds.yaml b/jetstream_common/secret_group_vars/clouds.yaml new file mode 100644 index 00000000..b4840d3a --- /dev/null +++ b/jetstream_common/secret_group_vars/clouds.yaml @@ -0,0 +1,18 @@ +$ANSIBLE_VAULT;1.1;AES256 +39376264656237656561343363313364633637376535386130343039643136636636613964333961 +3231323431356661333264646533343630633437376531640a313961613938343931653861316431 +38313665363931633832613133663438323933663135386262613732303863356464326262356134 +6636633333363938330a343964636437363961613162323635393033616633323838383835616565 +38633030373232636233376132653462613664343436313934653566386332376436376461333063 +35333030353737366530303635643861333166353736353039386130316662376439633239626534 +66636636346232303263656339306135626238633461643832373933343762653839613434636134 +35343939613332653431346664633737373363613537323734323637653466663039313136353734 +32313930376332363733383462656336323635616139366364663236393663316534656466323532 +38346430613236326238313463363262643836643533643331316331653665646430343637303566 +65323536373864623538336336383938363865626432333939353766666433646433663239363830 +65336430666636656335383065356638653534663865383639323562306232353932626166636261 +39353933323365326133623264653332373533396638326330623161666637613936306465666162 +39623136373339383261326366376231666437666362613464326333396332326266323934626232 +31363466653636326362393532666139633039393533343130373864313364326562333238336435 +38333031326231656334323065663366616133623637306434323461393633613064383133653731 +3738 diff --git a/jetstream_common/secret_group_vars/controllers.yml b/jetstream_common/secret_group_vars/controllers.yml index 2312f6fd..d7ad55bb 100644 --- a/jetstream_common/secret_group_vars/controllers.yml +++ 
b/jetstream_common/secret_group_vars/controllers.yml @@ -1,53 +1,54 @@ $ANSIBLE_VAULT;1.1;AES256 -62393064353039356130653437363630333866386262613034353837393530383164663666613936 -3237386139643131626339663030623436656663653630660a363764326364376432306363613432 -39356266356430663264343561333038626535313738663437326135653032626333393965396130 -3031636164646363610aa313937386634653264663938316139 +33333765383836326465303263333962346233393932623464363462653363346137336431383439 +6330346465613835350adiff --git a/jetstream_common/templates/slurm/slurm.conf.elastic.j2 b/jetstream_common/templates/slurm/slurm.conf.elastic.j2 new file mode 100644 index 00000000..80270c90 --- /dev/null +++ b/jetstream_common/templates/slurm/slurm.conf.elastic.j2 @@ -0,0 +1,56 @@ +## +## This file is maintained by Ansible - CHANGES WILL BE OVERWRITTEN +## +ControlMachine={{ controller_name }} +ControlAddr={{ controller_ip }} +# +AuthType=auth/munge +FastSchedule=1 +JobCompLoc=/var/log/slurm/slurm.job.log +JobCompType=jobcomp/filetxt +PluginDir=/usr/lib64/slurm +SchedulerType=sched/backfill +#SelectType=select/cons_res +#SelectTypeParameters=CR_CPU_Memory +SelectType=select/linear +SlurmUser=slurm +SlurmctldPort=7002 +SlurmctldTimeout=300 +SlurmdPort=7003 +SlurmdSpoolDir=/var/lib/slurm/slurmd/slurmd.spool +SlurmdTimeout=300 +StateSaveLocation=/var/lib/slurm/slurmctld/slurm.state +SwitchType=switch/none +DefaultStorageLoc=/var/log/slurm/slurm_accounting +# AccountingStorageType=accounting_storage/slurmdbd +# AccountingStorageHost=galaxy02.tacc.utexas.edu +#AccountingStoragePort=6819 +# AccountingStoragePort=30001 +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=task=15 +ProctrackType=proctrack/linuxproc +ClusterName={{ slurm_cluster_name }} +ReturnToService=1 +# Elastic config +ResumeRate=1 +ResumeProgram=/opt/slurm_cloud_provision/bin/launch +SuspendProgram=/opt/slurm_cloud_provision/bin/terminate +ResumeTimeout=300 +SuspendTime=180 +SuspendRate=5 +TreeWidth=256 
+BatchStartTimeout=780 +# +# Node Configurations +# +NodeName=jetstream-iu-elastic[1-64] State=CLOUD +# +# Partition Configurations +# +# PartitionName=normal Default=YES State=UP MaxTime=48:20:00 MaxNodes=1 Nodes=jetstream-iu-elastic[1-64] LLN=YES +PartitionName=multi State=UP MaxTime=48:20:00 MaxNodes=1 Nodes=jetstream-iu-elastic[1-64] LLN=YES + +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmctldDebug=9 +SlurmdLogFile=/var/log/slurm/slurmd.log +SlurmdDebug=debug5 diff --git a/jetstream_common/templates/slurm/slurm.conf.j2 b/jetstream_common/templates/slurm/slurm.conf.j2 index 0f335c61..450bdc4f 100644 --- a/jetstream_common/templates/slurm/slurm.conf.j2 +++ b/jetstream_common/templates/slurm/slurm.conf.j2 @@ -22,10 +22,10 @@ SlurmdTimeout=300 StateSaveLocation=/var/lib/slurm/slurmctld/slurm.state SwitchType=switch/none DefaultStorageLoc=/var/log/slurm/slurm_accounting -AccountingStorageType=accounting_storage/slurmdbd -AccountingStorageHost=galaxy02.tacc.utexas.edu +# AccountingStorageType=accounting_storage/slurmdbd +# AccountingStorageHost=galaxy02.tacc.utexas.edu #AccountingStoragePort=6819 -AccountingStoragePort=30001 +# AccountingStoragePort=30001 JobAcctGatherType=jobacct_gather/linux JobAcctGatherFrequency=task=15 ProctrackType=proctrack/linuxproc @@ -35,6 +35,11 @@ ReturnToService=1 # Node Configurations # # CPUs = Sockets * CoresPerSocket * ThreadsPerCore + +# Needed when setting up a controller without workers; plus this will allow +# jobs to be submitted and accepted by Slurm when there are no workers to +# take jobs. 
+NodeName=placeholder CPUs=64 State=future {% for node_type in slurm_nodes %} {% for host in groups[node_type.inventory_group] %} NodeName={{ hostvars[host]['inventory_hostname_short'] }} NodeAddr={{ hostvars[host]['ansible_host'] }} RealMemory={{ node_type.real_memory }} Sockets={{ node_type.sockets }} CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN @@ -45,7 +50,7 @@ NodeName={{ hostvars[host]['inventory_hostname_short'] }} NodeAddr={{ hostvars[h # {# well this is less than ideal, it should loop partitions, and then generate a hostlist for that partition. but this works for now... #} {% for node_type in slurm_nodes %} -PartitionName=multi State=UP MaxTime=48:20:00 MaxNodes=1 Nodes={{ slurm_hostlists.results[0].stdout }} +PartitionName=multi State=UP MaxTime=48:20:00 MaxNodes=1 Nodes={{ slurm_hostlists.results[0].stdout | default( 'placeholder') }} {% endfor %} SlurmctldDebug=7 diff --git a/jetstream_common/test_slurm.sh b/jetstream_common/test_slurm.sh new file mode 100644 index 00000000..4a81d472 --- /dev/null +++ b/jetstream_common/test_slurm.sh @@ -0,0 +1,9 @@ +#!/bin/bash +#SBATCH -n 1 +#SBATCH --partition=multi +#SBATCH -J test_job_name +#SBATCH -o /jetstream/scratch0/jobs/%N_%j.out +echo "Running a test job..." +sleep 5 +echo `ls /jetstream/scratch0/jobs/` +echo `date` diff --git a/jetstreamiuenv/README.md b/jetstreamiuenv/README.md new file mode 100644 index 00000000..007b187f --- /dev/null +++ b/jetstreamiuenv/README.md @@ -0,0 +1,162 @@ +Set up a Slurm cluster for use by Galaxy Main on the Jetstream cloud. + +The overarching architecture of this setup is that Galaxy Main uses a Slurm +cluster composed of multiple sub-clusters. Each sub-cluster runs a Slurm control +process and any number of workers to run the jobs. Sub-cluster controllers +also run Pulsar that is used for data staging. + + +Semi-automatic scalable cluster setup +=============================== +This playbook can be used to build and semi-automatically scale a Slurm cluster. 
+The playbook has been tailored for use with Galaxy Main server and Jetstream +IU region. All the path references are relative to +*[playbook root]/jetstreamiuenv*. + +### Create the controller instance + 1. *Launch a controller instance by hand* + - Create and use a security group (SG) that allows open communication + between instances in the same group (default is *gxy-workers-sg*) + - Create another security group that will allow the master instance to be + accessed from the outside (default is *gxy-sg*) + - The name of the instance needs to match *controller_name* variable in + *group_vars/all.yml* (default is *jetstream-iu-slurm-controller*) + - Image to use is a CentOS 7 (e.g., *736e206d-9c2c-4369-88db-8c3293bd2ad7*) + - Use a public key that is included in *secret_group_vars/controllers.yml* + (otherwise, you’ll be locked out of the instance after the play runs) + - Create a new volume and attach it to the instance (defaults to device + */dev/sdb*) + - (there is a `launch_slurm_controller.py` script to use for the launch) + 2. Make playbook updates + - Update *inventory* to include the controller public IP + - Update *controller_ip* in *group_vars/all.yml* variable to include the + instance private IP + - Update *jetstream_nfs_filesystems* variable in + *group_vars/galaxynodes.yml* to point to the controller’s private IP + - Make sure *slurm-drmaa* is commented out in *group_vars/controllers.yml* + 3. Run the playbook: + - `ansible-playbook -i jetstreamiuenv/inventory jetstreamiuenv/playbook.yml --ask-vault --limit=controllers` + - The playbook will automatically copy itself onto the controller (to + `/opt/slurm_cloud_provision/`) in preparation for configuring worker nodes + +If integrating with SlurmScale library, you should be all set now; the playbook +installed the library as well. If not using SlurmScale, keep going. 
+ +### Launch worker instance(s) +Worker instances get manually launched and then configured by running this +playbook from the controller instance. + + 1. Manually launch the workers + - Make sure they are in the same SG and network as the controller; + - Use key pair named *elasticity_kp* (it's contained in the playbook vault) + - Instance names need to have *jetstream-iu-large* prefix (defined in + *group_vars/slurmclients.yml*) and be numbered consecutively starting at 0 + +### Configure the new worker(s) and reconfigure the cluster +These steps are to be performed on the controller instance, all as user *root*. + + 1. Update the playbook's inventory to include the worker(s) info + - Update *galaxynodes* to include worker nodes' private IP addresses + - Set `jetstream-iu0.galaxyproject.org ansible_connection=local` + 2. Run the playbook + - Activate virtualenv from `/opt/slurm_cloud_provision/bin` + - Run the playbook with `ansible-playbook -i jetstreamiuenv/inventory jetstreamiuenv/playbook.yml` + +After this runs, the cluster should be showing all the worker nodes and be +configured to run jobs in `multi` partition. See section *Verify it works* +below for a sample job script. + + +Elastic scaling cluster setup +============================= +This playbook can be used to build an elastic Slurm cluster that, based on the +job load, acquires and subsequently releases worker instances. The playbook has +been tailored for use with Galaxy Main server and Jetstream IU region. + +### Setup the cloud account +Before the playbook can be run, it is necessary to create a suitable cloud +environment for it. Start by importing the public portion of a key pair +`files/elasticity/elasticity_kp.pub` from this repo into your account. Next, +create a security group called `gxy-workers-sg` with a rule to allow all +communication amongst instances belonging to the same security group. Also +create a public network called `gxy-slurm-net`. 
Add a subnet and attach it to a +public network via a router. Finally, create an empty volume that will be used +as a NFS-shared cluster file system. Names of all these resources can be found +and changed in `launch.yml`. + +### Launch a controller instance +We'll next launch an instance that will serve as the controller for this cluster. This instance needs to belong to `gxy-workers-sg` security group. You probably +want to create another security group that allows ssh connections and associate the instance +with it as well. You can reuse the `elasticity_kp` key pair but note that by +default, the playbook will override `authorized_keys` file on the instance and +place the set of keys available in `secret_group_vars/controllers.yml` for +`centos`, so make sure your key is included in the file or you will get locked +out of the instance. You can override this setting by defining a key via +`authorized_key_users` variable. Name the instance `jetstream-iu-slurm- +controller`. After launching the instance, associate a public IP address with it +and attach the earlier created volume (as device `/dev/sdb`). + +### Run the playbook +Well, before running the playbook, edit `group_vars/all.yml` and +`group_vars/galaxynodes.yml` to update the IP address of the controller +instance and specify its private IP address. Also, update `inventory` file to include the public IP of the instance under `jetstream-iu0.galaxyproject.org` variable: +``` +jetstream-iu0.galaxyproject.org ansible_ssh_host="149.165.XXX.XXX" ansible_ssh_user="centos" ansible_ssh_private_key_file="kp" ansible_ssh_common_args='-o StrictHostKeyChecking=no -o CheckHostIP=no -o "UserKnownHostsFile /dev/null"' +``` + +Elastic scaling of worker nodes is handled by an Ansible playbook that Slurm calls on (to do so, Slurm calls a script available in `files/slurm/launch`). 
With that, Ansible needs to know about the credentials for the target cloud; these need to be made available in a file called `clouds.yaml` in the root directory of this repo. The contents of the file should look like the following: +``` +clouds: + jetstream_iu: + auth: + username: 'username' + password: 'pwd' + project_name: 'project_name' + user_domain_name: 'tacc' + project_domain_name: 'tacc' + auth_url: 'https://jblb.jetstream-cloud.org:35357/v3' +``` + +Then, run the playbook with: +``` +ansible-playbook -i jetstreamiuenv/inventory jetstreamiuenv/playbook.yml --limit=controllers +``` + +On average, the playbook takes 4-5 minutes to run. + +### Verify it works +To verify the controller instance and the elastic scaling work, ssh to the instance and run a test job. A sample job script is included in the repo as `test_slurm.sh`. If using this job script, first create a directory for its output on the NFS file system: +``` +sudo mkdir /jetstream/scratch0/jobs +sudo chown centos:centos /jetstream/scratch0/jobs +``` +As `centos` user, submit the job script with +``` +sbatch /opt/slurm_cloud_provision/infrastructure-playbook/jetstream_common/test_slurm.sh +``` +In a couple of minutes, job output should be available in `/jetstream/scratch0/jobs`. + +Log file for the Slurm controller process is available in `/var/log/slurm/slurmctld.log`. Logs for the elastic launch/terminate process are available in `/var/log/slurm/launch/`. + +### Elasticity config options +There are some configuration options that can be changed for the cluster elasticity parameters. Instance type to be launched is supplied in `group_vars/all.yml` as `worker_instance_type`. `worker_image_id` can also be updated there. The amount of time Slurm will keep an idle instance around can be defined in `templates/slurm/slurm.conf.elastic.j2` under `SuspendTime` (value is in seconds). For other `slurm.conf` options, see [*slurm.conf* docs](http://slurm.schedmd.com/slurm.conf.html). 
+ +### Issues +Although on the surface this setup appears to work fine, in practice there are three issues that have not been resolved: + +**1. Zombie processes**: at the end of a cluster scaling operation, 6 ansible +threads will remain active on the controller. These will seemingly never exit as +they await some pid file. Running the scaling script by hand or via *systemd* +did not exhibit this issue. However, we were never able to figure out what was +causing this or how to resolve it. + +**2. Lingering instances**: when an instance crashes or is terminated by means +other than via Slurm's *SuspendProgram*, it is impossible to remove it from +Slurm's internal instance table. A [post on the Slurm mailing +list](https://groups.google.com/forum/#!topic/slurm-devel/QrVL4_Qc3uA) did not +get any replies. + +**3. Instance acquiring rate**: despite setting Slurm's *ResumeRate* to 1 +[worker node per minute], the number of instance startup requests would exceed +Slurm's ability to manage the requests resulting in *power_save programs getting +backlogged* error messages. 
diff --git a/jetstreamiuenv/group_vars/all.yml b/jetstreamiuenv/group_vars/all.yml index deed1fb0..e802a7dc 100644 --- a/jetstreamiuenv/group_vars/all.yml +++ b/jetstreamiuenv/group_vars/all.yml @@ -1,5 +1,9 @@ --- +worker_instance_type: m1.large +# CentOS-7-x86_64-GenericCloud-1607 +worker_image_id: 1790e5c8-315a-4b9b-8b1f-46e47330d3cc + all_groups: - name: G-803372 gid: 803372 @@ -22,5 +26,10 @@ all_users: home: /home/g2main shell: /bin/bash -controller_name: jetstream-iu0 -controller_ip: 10.0.0.10 +controller_name: jetstream-iu-slurm-controller +controller_ip: 10.0.0.11 + +jetstream_nfs_filesystems: + - device: "10.0.0.11:/scratch0" + dir: scratch0 + mountpoint: /jetstream/iu-scratch0 diff --git a/jetstreamiuenv/group_vars/controllers.yml b/jetstreamiuenv/group_vars/controllers.yml index c0013be6..d392e84a 100644 --- a/jetstreamiuenv/group_vars/controllers.yml +++ b/jetstreamiuenv/group_vars/controllers.yml @@ -10,7 +10,8 @@ group_packages: - libcurl-devel - nss-devel - openssl-devel - - slurm-drmaa + # FIXME: slurm-drmaa can't be installed on first playbook run because the depot repo is set up by the slurm role + # - slurm-drmaa group_files: - src: files/etc/dhclient.conf diff --git a/jetstreamiuenv/group_vars/galaxynodes.yml b/jetstreamiuenv/group_vars/galaxynodes.yml index ee62a753..889a5500 100644 --- a/jetstreamiuenv/group_vars/galaxynodes.yml +++ b/jetstreamiuenv/group_vars/galaxynodes.yml @@ -31,7 +31,10 @@ links: src: /cvmfs/data.galaxyproject.org/byhand/location force: yes -jetstream_nfs_filesystems: - - device: "10.0.0.10:/scratch0" - dir: scratch0 - mountpoint: /jetstream/iu-scratch0 +# Not being read by Ansible v2.2+ saying variable undefined when calling +# 'Remove static mounts' although the host group `jetstreamnfsclients` is a +# parent of `galaxynodes` server group in the inventory? Moved to all.yml. 
+# jetstream_nfs_filesystems: +# - device: "10.0.0.11:/scratch0" +# dir: scratch0 +# mountpoint: /jetstream/iu-scratch0 diff --git a/jetstreamiuenv/group_vars/slurmclients.yml b/jetstreamiuenv/group_vars/slurmclients.yml index 7e629634..b20b5ae7 100644 --- a/jetstreamiuenv/group_vars/slurmclients.yml +++ b/jetstreamiuenv/group_vars/slurmclients.yml @@ -13,7 +13,8 @@ group_users: home: /var/lib/slurm shell: /bin/bash -slurm_yum_repo_baseurl: https://depot.galaxyproject.org/yum/el/$releasever/$basearch +slurm_yum_repo_baseurl: https://s3.amazonaws.com/gxy-yum/el/$releasever/$basearch +# slurm_yum_repo_baseurl: https://depot.galaxyproject.org/yum/el/$releasever/$basearch slurmd_spool_dir: /var/lib/slurm/slurmd/slurmd.spool slurmctld_state_dir: /var/lib/slurm/slurmctld/slurm.state @@ -21,7 +22,7 @@ slurmctld_state_dir: /var/lib/slurm/slurmctld/slurm.state slurm_cluster_name: jetstream-iu slurm_nodes: - - real_memory: 29997 + - real_memory: 29996 sockets: 10 partition: large inventory_group: jetstream-iu-large diff --git a/jetstreamiuenv/inventory b/jetstreamiuenv/inventory index 4e3c6af1..c5b9794f 100644 --- a/jetstreamiuenv/inventory +++ b/jetstreamiuenv/inventory @@ -1,6 +1,6 @@ # looking for cvmfs1-iu0? 
It's in the galaxyenv - -jetstream-iu0.galaxyproject.org +jetstream-iu0.galaxyproject.org ansible_host="149.165.172.156" ansible_user="centos" ansible_ssh_private_key_file="~/.ssh/enis_afgan_galaxy_rsa" ansible_ssh_common_args='-o StrictHostKeyChecking=no -o CheckHostIP=no -o "UserKnownHostsFile /dev/null"' +# jetstream-iu0.galaxyproject.org ansible_connection=local # Uncomment on the controller [baseenv] jetstream-iu0.galaxyproject.org @@ -8,7 +8,7 @@ jetstream-iu0.galaxyproject.org [baseenv:children] galaxynodes -# "contoller" node(s) for this cloud (not necessarily a slurm controller) +# "controller" node(s) for this cloud (not necessarily a slurm controller) [controllers] jetstream-iu0.galaxyproject.org @@ -21,8 +21,8 @@ jetstream-iu0.galaxyproject.org [slurmclients:children] galaxynodes -;[slurmelasticservers] -;jetstream-iu0.galaxyproject.org +[slurmelasticservers] +jetstream-iu0.galaxyproject.org [cvmfsclients] [cvmfsclients:children] @@ -33,7 +33,7 @@ controllers [jetstreamnfsclients:children] galaxynodes -[surmexechosts] +[slurmexechosts] [slurmexechosts:children] galaxynodes @@ -42,11 +42,12 @@ galaxynodes jetstream-iu-large [jetstream-iu-large] -jetstream-iu-large0 ansible_host=10.0.0.20 -jetstream-iu-large1 ansible_host=10.0.0.21 -jetstream-iu-large2 ansible_host=10.0.0.22 -jetstream-iu-large3 ansible_host=10.0.0.23 -jetstream-iu-large4 ansible_host=10.0.0.66 -jetstream-iu-large5 ansible_host=10.0.0.67 -jetstream-iu-large6 ansible_host=10.0.0.68 -jetstream-iu-large7 ansible_host=10.0.0.69 +# jetstream-iu-large0 ansible_host=10.0.0.68 +# jetstream-iu-large1 ansible_host=10.0.0.67 +#jetstream-iu-large1 ansible_host=10.0.0.21 +#jetstream-iu-large2 ansible_host=10.0.0.22 +#jetstream-iu-large3 ansible_host=10.0.0.23 +#jetstream-iu-large4 ansible_host=10.0.0.66 +#jetstream-iu-large5 ansible_host=10.0.0.67 +#jetstream-iu-large6 ansible_host=10.0.0.68 +#jetstream-iu-large7 ansible_host=10.0.0.69 diff --git a/jetstreamiuenv/launch.yml b/jetstreamiuenv/launch.yml 
new file mode 120000 index 00000000..80cee732 --- /dev/null +++ b/jetstreamiuenv/launch.yml @@ -0,0 +1 @@ +../jetstream_common/launch.yml \ No newline at end of file diff --git a/jetstreamiuenv/terminate.yml b/jetstreamiuenv/terminate.yml new file mode 120000 index 00000000..478a873e --- /dev/null +++ b/jetstreamiuenv/terminate.yml @@ -0,0 +1 @@ +../jetstream_common/terminate.yml \ No newline at end of file diff --git a/jetstreamtaccenv/group_vars/slurmclients.yml b/jetstreamtaccenv/group_vars/slurmclients.yml index c37d8d7f..21891537 100644 --- a/jetstreamtaccenv/group_vars/slurmclients.yml +++ b/jetstreamtaccenv/group_vars/slurmclients.yml @@ -21,7 +21,7 @@ slurmctld_state_dir: /var/lib/slurm/slurmctld/slurm.state slurm_cluster_name: jetstream-tacc slurm_nodes: - - real_memory: 29997 + - real_memory: 29996 sockets: 10 partition: large inventory_group: jetstream-tacc-large diff --git a/roles/slurm/tasks/elastic.yml b/roles/slurm/tasks/elastic.yml index 1ba85f2d..55df21a3 100644 --- a/roles/slurm/tasks/elastic.yml +++ b/roles/slurm/tasks/elastic.yml @@ -2,13 +2,83 @@ - name: Install virtualenv (yum) yum: - pkg: python-virtualenv + pkg: "{{ item }}" when: ansible_os_family == "RedHat" + with_items: + - python-virtualenv + - ipython - name: Create virtualenv for elastic components pip: name: "{{ item }}" virtualenv: "/opt/slurm_cloud_provision" + state: latest + with_items: + - pip + - setuptools + - cython + +- name: Install specific Ansible version in the venv + pip: + name: ansible + virtualenv: "/opt/slurm_cloud_provision" + version: 2.1.5.0 + +- name: Download Pyslurm + git: + repo: https://github.com/PySlurm/pyslurm + version: 16.05.5 + dest: /opt/pyslurm + +- name: Install Pyslurm + shell: "{{ item }}" + args: + chdir: /opt/pyslurm with_items: - - shade - - ansible + - /opt/slurm_cloud_provision/bin/python setup.py build + - /opt/slurm_cloud_provision/bin/python setup.py install + +- name: Download SlurmScale + git: + repo: https://github.com/afgane/slurmscale 
+ dest: /opt/slurm_cloud_provision/slurmscale + ignore_errors: yes + +- name: Install SlurmScale + shell: /opt/slurm_cloud_provision/bin/python setup.py install + args: + chdir: /opt/slurm_cloud_provision/slurmscale + +- name: Copy SlurmScale config file + copy: + remote_src: True + src: /opt/slurm_cloud_provision/slurmscale/slurmscale.ini.sample + dest: /opt/slurm_cloud_provision/slurmscale/slurmscale.ini + +- name: Copy this playbook + synchronize: # sync is much faster than copy but requires chown task below + src: "{{ playbook_dir }}/../" + dest: /opt/slurm_cloud_provision/infrastructure-playbook/ + +- name: Place vault pass + copy: + src: "../jetstream_common/files/elasticity/vp" + dest: /root/.vault_pass + mode: "0600" + +- name: Place CloudBridge config + copy: + src: "../jetstream_common/files/elasticity/cloudbridge" + dest: /root/.cloudbridge + mode: "0600" + +- name: Ensure root's .ssh dir exists + file: + path: /root/.ssh + state: directory + +- name: Place PK + copy: + src: "../jetstream_common/files/elasticity/elasticity_kp" + dest: /root/.ssh/id_rsa + mode: "0600" diff --git a/roles/slurm/tasks/main.yml b/roles/slurm/tasks/main.yml index 25bec879..9cb9bad6 100644 --- a/roles/slurm/tasks/main.yml +++ b/roles/slurm/tasks/main.yml @@ -38,13 +38,17 @@ #- slurm-torque when: ansible_os_family == "RedHat" -# FIXME: this task will fail if slurmservers[0] has not already completed the slurm.conf task that follows it +- name: Check if any workers are defined locally + debug: + msg: '{{ groups[item.inventory_group] | length > 0 }}' + with_items: "{{ slurm_nodes }}" + register: local_workers + - name: Acquire hostlist command: scontrol show hostlist {{ groups[item.inventory_group] | join(",") }} - with_items: slurm_nodes - delegate_to: "{{ groups['slurmservers'][0] }}" - run_once: true + with_items: "{{slurm_nodes}}" register: slurm_hostlists + when: local_workers.results[0].msg == True and 'slurmexechosts' not in group_names - name: Install slurm.conf template: 
@@ -96,7 +100,7 @@ group: slurm mode: 0755 state: directory - when: "'slurmservers' in group_names" + when: "'slurmservers' in group_names or 'slurmexechosts' in group_names" - name: Check munge dir file: