diff --git a/ansible.cfg b/ansible.cfg index 9af49e2c..f5b7f9a7 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -1,4 +1,6 @@ [defaults] +host_key_checking = False +callback_whitelist = profile_tasks # include roles from galaxyproject/ansible-common-roles roles_path = roles:common_roles @@ -17,3 +19,6 @@ transport = ssh # These are necessary for cloud instances #pipelining = False #ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no + +[paramiko_connection] +record_host_keys = False diff --git a/jetstream_common/files/elasticity/cloudbridge b/jetstream_common/files/elasticity/cloudbridge new file mode 100644 index 00000000..1bb4aec6 --- /dev/null +++ b/jetstream_common/files/elasticity/cloudbridge @@ -0,0 +1,15 @@ +$ANSIBLE_VAULT;1.1;AES256 +30613566326532613235643863643061666535326538303961656161383562613538666335353338 +3732646138623733353131626236616662303661626266620a373630623662346430623734656131 +66363662643339383161663664396338633130383566323936653861363830393833653939623563 +3431316239313237610a303866396566656638613136626438373364326235346639643466323937 +66623562353539363864646639613036306335316138616332656263393564616463323839306531 +66636465626539366661343261633530633337653030633864663135323033626333316166366535 +65326137623864643265393465383139396538363835356232303162383935356139333738343230 +34363136343137646566653635376663343939316436613762313337353936656236366162383232 +39663239396633373066323638666135623964393361666539303032343539383335313936333739 +63313266346562646433303238303232386435643562343164346533323265313136653263393734 +66333636386336623265333263636132323331326663396436326230636236316136323265383236 +32646165386631353538333365383964646163623631333762333961343733393135396563313363 +63663334333163663735323064353166343930343639383930393061623966656665346636613665 +3332336361646631316431653635666536336538633936663365 diff --git a/jetstream_common/files/elasticity/elasticity_kp b/jetstream_common/files/elasticity/elasticity_kp new file mode 100644 index 00000000..1620a97e --- /dev/null +++ b/jetstream_common/files/elasticity/elasticity_kp @@ -0,0 +1,90 @@ +$ANSIBLE_VAULT;1.1;AES256 +64646363613565643537363537623663373961326364366636343061323666396235326430623538 +6263373731333466383561386139653035313362326237660a366639333261643337643332666164 +32326564623561663435666162343930396634656130313039363930353664666336323839356339 +3566623532316233630a333936353261643865633236323930363132613433366236663638666535 +65376430393663326237626239373365363263316664353163653664363932383134633539303764 +38363132616265633134656362616466396635383534336133333333613566666638303936643566 +65393431656337323965623564613366346130663233663761306662373466313364363366393561 +32393964643664386132313139366630326535323137636235393234383861376632613339613961 +39376231303465383131333035383262383032333031303963616336363738616435313534363561 +62336532633861303438396263663339653436653566633436353232326532633237343566346338 +61633837623235653465623635363338643662616265393935373532653237616366613637633631 +35636237633433623863393766396430386264623631656336663661633037306262376666386130 +37343938393766643534333662633436386635343262333263343236356639333035396138656166 +63623334373930356139653664623633316437616233366339623039386633373438386530373531 +33633265353030376162363938333038343432626565363463643137353635613933313433383561 +33623938623530643062656266336630366131383839636562636364303364323366653837663935 
+39366532353635323137353631383132323435616636366636653965343637616164653236323662 +38643537643262653032393963303861326239303064396363376262346439346264356435316561 +39666461303234663136363264343962356539333632343336646637633533303638373536313064 +32353532396162356535353634333236646162633463616335653665333833323331396264346362 +36633332396164316663353439343737316631376666633862366232353535623135303232323838 +36373164303062613030336363306336343762313534626265663064633632303131373730333532 +36343439646238303531633862363361663431623934396566393733353838666564316162623730 +66313263313037376635393937333266636561636634663136623661626530316164633536646339 +38653937383866303065313637366138623933336236636138383236333439616463343863623139 +66396336353439656237313336636462366364653038316662353465336130383433633363313633 +62353264373835393265323131363137353139313833373335363366373636346639313538653436 +63306535633665303735323732373266663864326666666239393134636530323061636466393530 +32366630376661613637626362396434623332613561383838356264623863383561303564333936 +30383834303661366536313038323864383438396438373661656236363566366334343237306665 +65343537626639386161373830643866336639383765353363363465643732333366623562356262 +30636535306630373165363639316363643835313765313562343835396461333733613130383865 +30313866666633373432626361393633616534356132663165356332326635336438316463303035 +62653439663534393336363635613163383630363565366562616331646164313139653333376633 +34333062653436383463346561343934333865386436666265366537613464613530383532323231 +35363638343466656630333334633432653164393936346139633233646334653631386131356537 +37616165353965633333353031666166363737363366616261373532666136363330353263623362 +35636636306431316336623233383966633939636264653432643865393463383130386134323736 +63623663623230346266343839386132303631343036333334393135643464393837326234623238 +33306330313237386362623265376232343630653535313532376563373031616635326635653336 +63303232623166663866313539333337323839643934366363333733636437623163366233383062 +34306133373238333963626333363830626261333235376465373661643532336530623431313731 +61383634376437373135353831376364306639646565346234386265656333396337646233643937 +65343336356531633534353033663135666136653565303764346264326437643331643331623939 +37313336616438323538623765303635303637313263636131353965316434653837306466393836 +61373464306133613164383236333230363139643061623338363965333230613963356230313939 +37396131326634613164393231636635303633373461376134323630383336323961663137636264 +62643463643132363634353164383232613866356630366536393039653465366230636234386566 +39393032396462663634353739336664646561396361633431336638316266303837313633363263 +37656235623462646166643132326462373930656465646436346333343234363339656136373431 +33306466613861633632643063656266313762333931383335636137386165643237396564356635 +61383934386239616638623333626532373966623163373466326466643964363733666134656562 +35653963613064383336363338323766646436663932376131653664313964366139313130643733 +61326131613336356430346432356166653466333534333136356533613939376333613237616362 +64626638396263356138343132333633613261386430313932653536303565663535336631333830 +36653436616539363065383335366435626138633037383832366434316530613938383138303964 +38306334346638376639313763383934633931656530393761373833396565333566656130356662 +30636431663365616532383764386338356137643736356431393536346635313265616338633630 +61386432343630333734353834316664326431363335626365663335346239313633363135643563 
+64646135363630396630616161326334393439613431636231353831613061323963353434643136 +37323865343361306165343534323831613234383432303632333039643835336664396564306161 +62366331653061363931313634333131366531613464383065316362393964323030643432623666 +30386634646633376236643062393032643265376536303263313562653165643531613066396363 +38666138316532626434303161316262646230373164656639306264303430323461643536653033 +38333034356464336538653130323036646235376437346436383339333963633438343965636132 +33316465356634343633363131663533336139343134323265633061326431653137343261653232 +34666162626262373032316633643339376334613836656464623236653938343535623962396539 +36653034343930393163663431393732306264316662316230663564336361653665356566616165 +62646530623561613466303362383562663065346533306338633065613437666236623161326530 +66656535356635646661313966386437653366613661656138366664373162336333373932613765 +30323861326531333735626537393036633931343962663031663561333336333634333938323134 +31303630306137626130323731616432353635343365333666383639623064376264346536353738 +61656337313632383564336439363536636430396335653234376662393465653832363436366635 +65373536376631353633336230353236343666366636666236386264343136623963376539323431 +38656332353933623631316338393939313766636537333564346234613238613065626566613366 +63646533326136326462623439313232616534646139373961386439616563333164656565626137 +37653538636430663265356364363763626532386262353861666266323330303965323939336465 +33303931353830333565666163373766353265653464616462623165616335663764316237356335 +31393165323331356136306662393462336562353635323734313963626337393662323932666361 +30656664353465623031623631343264613662316266363364373634313966643839303136386564 +62643862326566383333326462646131626461316238383561653531333062663333653134326463 +33313566653963393330363337303435353531656461663436663161336631326533633834346536 +39323464363930636631323163653333623565613134393366656336633664613563623339366531 +61353436623466316664373135623334396432626163353065646536613331363865393436333339 +35383132383731373539373632366633643063636664316630383935376262636561376338336639 +62326636393664373736393635663833663530376633633431613765313430393162306431363463 +64383435386262363764663434633264333666383335356361643836383766353933643930336132 +66323139363630343463623765626138666531613431343733383331366132323234396237376535 +3562 diff --git a/jetstream_common/files/elasticity/elasticity_kp.pub b/jetstream_common/files/elasticity/elasticity_kp.pub new file mode 100644 index 00000000..92b4fb7d --- /dev/null +++ b/jetstream_common/files/elasticity/elasticity_kp.pub @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCpw4D9NR0at80ehc84kTchP2aDCV9HbZetptVCwc2/sdzUHvecUXNRc61RYKttF3kdgWO8UhGWjUUjNCMOR/gYxV1Ui1hpihFExdg2tHbXpkArdrHJK6n+QWf7qYrNTLFVFH2XVxouzJ4k37slGNeQMWczRHS2ZUL+cbfYOEWe8RhWRqhRjvAdxLnQt1dg/sKc0MqeDwEVAmdgNHUozbeUxKVVsoxWBeUiLK7xTd0PQ/jXRVWY1pYQ2xfjvhm1DiylQO/5fox8Z5MmyqBSAsKdGgfe47K335QmmrEKtq/6O1AAnm6D0Pkqky3EgzT7g/DpT2n8VyCWAdFIvrQAg65V Generated-by-Nova diff --git a/jetstream_common/files/elasticity/vp b/jetstream_common/files/elasticity/vp new file mode 100644 index 00000000..b82b0bc4 --- /dev/null +++ b/jetstream_common/files/elasticity/vp @@ -0,0 +1,7 @@ +$ANSIBLE_VAULT;1.1;AES256 +39343936396638626438353764366237336637336537383966393234303534393730303635653564 +3837353262653931633761336561623666653464303032630a323933393438376438663332666238 +34333133326533313136363935373837376538636230653239353430356330633037393766373763 
+3031376537643338370a333939656563646539356563306666306131303638666666633036383161 +34636336623039376638633430353138363031366230633833633234616238613165303534336464 +3634326232303961313037333636646562393732666334653033 diff --git a/jetstream_common/files/slurm/launch b/jetstream_common/files/slurm/launch index 68f7f223..5c73d26e 100755 --- a/jetstream_common/files/slurm/launch +++ b/jetstream_common/files/slurm/launch @@ -9,7 +9,7 @@ LOG_DIR=/var/log/slurm/launch set -xv VENV=/opt/slurm_cloud_provision -PLAYBOOK=/home/centos/infrastructure-playbook +PLAYBOOK=/opt/slurm_cloud_provision/infrastructure-playbook : ${HOME:=/var/lib/slurm} export HOME @@ -17,5 +17,5 @@ export HOME . $VENV/bin/activate cd $PLAYBOOK instances=`scontrol show hostnames "$1" | tr '\n' ',' | sed 's/,$//'` -ansible-playbook -i jetstreamenv/instance_inventory jetstreamenv/launch.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_launch=$instances" +ansible-playbook -i jetstreamiuenv/instance_inventory jetstreamiuenv/launch.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_launch=$instances" ) 2>&1 | tee "$LOG_DIR/launch.`date +%s`.${1}.log" diff --git a/jetstream_common/files/slurm/terminate b/jetstream_common/files/slurm/terminate index 1bb954c1..4b78dc79 100755 --- a/jetstream_common/files/slurm/terminate +++ b/jetstream_common/files/slurm/terminate @@ -9,7 +9,7 @@ LOG_DIR=/var/log/slurm/launch set -xv VENV=/opt/slurm_cloud_provision -PLAYBOOK=/home/centos/infrastructure-playbook +PLAYBOOK=/opt/slurm_cloud_provision/infrastructure-playbook : ${HOME:=/var/lib/slurm} export HOME @@ -17,6 +17,6 @@ export HOME . $VENV/bin/activate cd $PLAYBOOK instances=`scontrol show hostnames "$1" | tr '\n' ',' | sed 's/,$//'` -ansible-playbook -i jetstreamenv/instance_inventory jetstreamenv/terminate.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_terminate=$instances" +ansible-playbook -i jetstreamiuenv/instance_inventory jetstreamiuenv/terminate.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_terminate=$instances" ) 2>&1 | tee "$LOG_DIR/terminate`date +%s`.${1}.log" true diff --git a/jetstream_common/launch.yml b/jetstream_common/launch.yml index ad990ec2..e128d4b4 100644 --- a/jetstream_common/launch.yml +++ b/jetstream_common/launch.yml @@ -8,14 +8,14 @@ tasks: - name: Launch new instance(s) os_server: - cloud: jetstream_iu + cloud: jetstream_iu # Details defined in clouds.yaml, including auth name: "{{ item }}" - image: "CentOS 7 Stock 1601" - flavor: "m1.large" - key_name: "slurm_jetstream0" - security_groups: "default,usegalaxy-control" + image: "{{ worker_image_id }}" + flavor: "{{ worker_instance_type }}" + key_name: "elasticity_kp" + security_groups: "gxy-workers-sg" nics: - - net-name: "usegalaxy" + - net-name: "gxy-slurm-net" auto_ip: no with_items: "{{ jetstream_instances_to_launch.split(',') }}" register: jetstream_instances_launched @@ -29,7 +29,8 @@ add_host: name: "{{ item.server.name }}" ansible_host: "{{ item.server.private_v4 }}" - groups: "baseenv,galaxynodes,slurmclients,slurmexechosts" + groups: + "baseenv,galaxynodes,slurmclients,slurmexechosts,jetstreamnfsclients" with_items: "{{ jetstream_instances_launched.results }}" - name: Spin waiting for instance(s) to become accessible diff --git a/jetstream_common/playbook.yml b/jetstream_common/playbook.yml index 5ee720e7..110a57c7 100644 --- a/jetstream_common/playbook.yml +++ b/jetstream_common/playbook.yml @@ -5,6 +5,7 @@ remote_user: centos become: yes 
become_method: sudo + connection: paramiko pre_tasks: - name: Locate secret group variable files local_action: @@ -65,7 +66,7 @@ dest: /etc/hosts - name: Install supervisor - hosts: all + hosts: controllers remote_user: centos become: yes become_method: sudo diff --git a/jetstream_common/secret_group_vars/clouds.yaml b/jetstream_common/secret_group_vars/clouds.yaml new file mode 100644 index 00000000..b4840d3a --- /dev/null +++ b/jetstream_common/secret_group_vars/clouds.yaml @@ -0,0 +1,18 @@ +$ANSIBLE_VAULT;1.1;AES256 +39376264656237656561343363313364633637376535386130343039643136636636613964333961 +3231323431356661333264646533343630633437376531640a313961613938343931653861316431 +38313665363931633832613133663438323933663135386262613732303863356464326262356134 +6636633333363938330a343964636437363961613162323635393033616633323838383835616565 +38633030373232636233376132653462613664343436313934653566386332376436376461333063 +35333030353737366530303635643861333166353736353039386130316662376439633239626534 +66636636346232303263656339306135626238633461643832373933343762653839613434636134 +35343939613332653431346664633737373363613537323734323637653466663039313136353734 +32313930376332363733383462656336323635616139366364663236393663316534656466323532 +38346430613236326238313463363262643836643533643331316331653665646430343637303566 +65323536373864623538336336383938363865626432333939353766666433646433663239363830 +65336430666636656335383065356638653534663865383639323562306232353932626166636261 +39353933323365326133623264653332373533396638326330623161666637613936306465666162 +39623136373339383261326366376231666437666362613464326333396332326266323934626232 +31363466653636326362393532666139633039393533343130373864313364326562333238336435 +38333031326231656334323065663366616133623637306434323461393633613064383133653731 +3738 diff --git a/jetstream_common/secret_group_vars/controllers.yml b/jetstream_common/secret_group_vars/controllers.yml index 2312f6fd..d7ad55bb 100644 --- a/jetstream_common/secret_group_vars/controllers.yml +++ b/jetstream_common/secret_group_vars/controllers.yml @@ -1,53 +1,54 @@ $ANSIBLE_VAULT;1.1;AES256 -62393064353039356130653437363630333866386262613034353837393530383164663666613936 -3237386139643131626339663030623436656663653630660a363764326364376432306363613432 -39356266356430663264343561333038626535313738663437326135653032626333393965396130 -3031636164646363610a633533353335396239656164346235366165363161646565336433353438 -65326339616438613839613735336535356163393639303531366230323566353362623664626165 -37363666633461396534346166373662316565386565656634633037643765643630386533393333 -61646163646235363362306138306565366438623431663735313938373438316633333637313136 -62363566666362356262326530393531326336326532343437333861326531396531393331633635 -61626135663065363264333034386632653930643339316235326462363438323464373337393437 -34383734353438373563663261366564306336323564313335653737636430393032613235663136 -34343336303135346338306232633263633532333739653733343036316330633165346639323936 -61366666366438396335656631643837333966303266303036393865366138343339653431616634 -39646130646539346232323533373437396238643232376137633936613832333535383862656166 -62656233653533363337623164373334623136346639643330333836386538353363366461633136 -35613234623865646163313937346266326137333465616466393133626135323562623934636565 -65333231363964323730656561366634313762633432633962623634623836363436386663383339 -36303336323166653631633731623136633764663331366531626131656363646337393266323662 
-63313836316666653061373761653032366165393064363832343237373039623564626530323530 -39366462666336343363613330653937353063313765306435353761626530393936646461636332 -32356131373937333964616139313962623635346531346133316661303262396631613538633135 -38383061353135313163643565616464333565616132326265636663363461346530653361373362 -31333836373563653331653461333533383066666463613132373561643164633032396436336166 -63373963363663323361386239323361376235616162623330613939333130613432313965303466 -64333431656239653765326365346534363737643439336230616661346466353635353662303735 -30336535636535373834366631623334363038383434303661366638356462303363313236303434 -62353835373634613531306463373632333839393038346232353063333661336131326437353835 -38653339373830646637626336613136616138356562636137623932373038373638316439386136 -37366432616239376162316263373633663266316438643036633133396639333035363138666261 -65336332663965393561393136646636333031316639343830653333633162316265323361656462 -34336265633666306361343130633437626235303464633539373561336439303835393136326337 -62393334333363353663343636646664333439343930396166363236323438366464383931346439 -65616233313366316234666164633366313962643061363235386562646532363738313465613032 -65663966643936663831663366336662363534353531343835663265623434633836383832383466 -37343732306436303131666337366231636234333064616536306661383336393830386132336261 -32336634333532313939343233336239393731636330326231613334663731663535363032373166 -34366264306334616537336532393531643333393165353563383866373436323831363230346539 -39656464373264323734663161643033393466626338633837393837343062656332663037316131 -65373831356365623162343366343363336631663335353335396164623634353762316631313235 -32356130363337393331383531376431653137663235633033313661313965633161623530616466 -36326462363431333431303437663136653661386131383466333334343262663666386139363832 -61353933653333616130343361336130373136373466396431363061353264303061626361396466 -30313061393432303332383239363162333932376434363330363530353336623464656131643533 -65353363396661656464636338366263363235343536656565663936663037323735363933663266 -64363131343764643832393130346363386537633431623663343761356366376264333134383438 -30616666393363646438326136303235346234373266366139393136336665613265346133303836 -65343263343734353165333066393164323535346564656264383066663566306162306335643764 -66396236313135633535333231313965303866326362336536383232386661366239373030333762 -30663536656462663764323037303637663864383735306137646431353965366661663464613064 -33356232623731353964393933623235386565653436333636333036636165333039613835336538 -30323464643164333864346538333762653838393466393439393836373933623237613033626233 -33616438373131306630326431396665373734613531346633306437616230393138306665306130 -36303635616265653935 +34366663643338623731303138343366373164316639663931643233386434356163633637636434 +3131663331313366356161633838666162623536376438610a313937386634653264663938316139 +33333765383836326465303263333962346233393932623464363462653363346137336431383439 +6330346465613835350a343738373238353037653363323762666138363332636266633436303361 +33383130386233333131636266313333303366383439356462303063656661643835333239613862 +65343733343735663936393236653264353933336430343930646235333564316561643364353066 +61393838633131366333373236626337613165383062633730323861323238633764366433363837 +30396135346262336162386432333565386437326235643161396434666238326435666461656666 +65313365336563666231383638306163323931363962393564363964333132626636396363643333 
+39336366313032313630386464356433396263333163666264323834633066346265663735613761 +62316639363438383232343661636664666639376431343138633539653033333965633438613535 +62663230316637306366633837643930323862393963616430363137396531396165626335393537 +61633365653864653138366139656163373532363836326261306265376631613438653963626239 +66656430336361373335333133346432353937613638313434373134353836336634646566666365 +39663764643331626232343865306138383537393736666636333136356534396561306239636333 +36646430303964336263656432323736353437343837373931656464333362636361356264636163 +33356161373263336533356138306630363666366339636339373064333062326661656132373738 +39333930393532306638306562616135303965356136616362616263393164363639663335663364 +37393466306237383132616334373430386330336665656263616264333336663737386639353930 +66306534353336316634626638393665356563613765363062393866376461316433653866323833 +30386465666130643562626333333139303466343865393830356663303733653631643336323231 +62636536653238613563643336343430366239343033336637373066376136376233306532646636 +62363061376634333338616263383532353233363866356233363461333830613134386336316632 +61643838323936663061366363356566356639373437663338376234303837646137646163653833 +30633033343031313432656361363830343663306361623931373736633766333764316265303833 +64303839623832643536336531633262636564343039363335313965323339323964373938333039 +65363963326664633334353233666464396566356435346465633034373033666563316535666135 +65333965663737333933323263373637646138383635316230373664356664363932366535623038 +33386661366534613066356637643764393163633035613334396237343838313065333730636134 +39393135383931333533343632653632613737626265643637666130306563323430643736613839 +39333663323265633466386535623166383331326337376131303730636466623636626262623839 +34313730623033363465303833613537373234626362666363336664353736383938633338613666 +66366632323433353236653264623863356532323862643931353238396439363835316639346666 +33353230666339366530323061386636643132363064666537656130333265653830373532353561 +63386531666532636333393333343537626130656237373339613261623563383331333434343336 +36396161323232376132363736316162646366316666396239626462663034663134613661383637 +39616264636664323563356537633566613939653831353265323137333565353630653631366234 +39616131643732306432313865376261343836386334613835326164326232343632393432613632 +62316633373832383465343932666135346237626431636632363830343962313034613365353537 +36646631323862376636383532373536383431366361343163373861656636386634616135356663 +32346332316637663535376136636138376563663131323031373231393162306333333066363934 +34333831303134376233643935373662353930383465373761646136613666663965653964616237 +38363364303532336638396462396435653565393932316361303131346536653139323631633733 +35613861393533346366636565396533316535633035363038366432626361653735393036303132 +35343635393634373362363337376136633664343432646634323563333863613433393835313334 +62323831373462666461346365613061326230666538373962376130356535363634363865396332 +34616334636663663639633966383831323361383532623239633930633065363534393063363430 +62353631313139356364626666373739383336613234373433663464623339663765623366646337 +38383934616434356465386239336233336266303961346466373035346433343063323130316530 +34343361326532356366656334303836656263643734643531633837633938313636303533333434 +37363834613666303831636332343263386138383866376663336639616232333761313465313430 +32616135373031356261333265373936666562373439636261663339653832373163633039343162 +6432 diff --git 
a/jetstream_common/templates/slurm/slurm.conf.elastic.j2 b/jetstream_common/templates/slurm/slurm.conf.elastic.j2 new file mode 100644 index 00000000..80270c90 --- /dev/null +++ b/jetstream_common/templates/slurm/slurm.conf.elastic.j2 @@ -0,0 +1,56 @@ +## +## This file is maintained by Ansible - CHANGES WILL BE OVERWRITTEN +## +ControlMachine={{ controller_name }} +ControlAddr={{ controller_ip }} +# +AuthType=auth/munge +FastSchedule=1 +JobCompLoc=/var/log/slurm/slurm.job.log +JobCompType=jobcomp/filetxt +PluginDir=/usr/lib64/slurm +SchedulerType=sched/backfill +#SelectType=select/cons_res +#SelectTypeParameters=CR_CPU_Memory +SelectType=select/linear +SlurmUser=slurm +SlurmctldPort=7002 +SlurmctldTimeout=300 +SlurmdPort=7003 +SlurmdSpoolDir=/var/lib/slurm/slurmd/slurmd.spool +SlurmdTimeout=300 +StateSaveLocation=/var/lib/slurm/slurmctld/slurm.state +SwitchType=switch/none +DefaultStorageLoc=/var/log/slurm/slurm_accounting +# AccountingStorageType=accounting_storage/slurmdbd +# AccountingStorageHost=galaxy02.tacc.utexas.edu +#AccountingStoragePort=6819 +# AccountingStoragePort=30001 +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=task=15 +ProctrackType=proctrack/linuxproc +ClusterName={{ slurm_cluster_name }} +ReturnToService=1 +# Elastic config +ResumeRate=1 +ResumeProgram=/opt/slurm_cloud_provision/bin/launch +SuspendProgram=/opt/slurm_cloud_provision/bin/terminate +ResumeTimeout=300 +SuspendTime=180 +SuspendRate=5 +TreeWidth=256 +BatchStartTimeout=780 +# +# Node Configurations +# +NodeName=jetstream-iu-elastic[1-64] State=CLOUD +# +# Partition Configurations +# +# PartitionName=normal Default=YES State=UP MaxTime=48:20:00 MaxNodes=1 Nodes=jetstream-iu-elastic[1-64] LLN=YES +PartitionName=multi State=UP MaxTime=48:20:00 MaxNodes=1 Nodes=jetstream-iu-elastic[1-64] LLN=YES + +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmctldDebug=9 +SlurmdLogFile=/var/log/slurm/slurmd.log +SlurmdDebug=debug5 diff --git a/jetstream_common/templates/slurm/slurm.conf.j2 b/jetstream_common/templates/slurm/slurm.conf.j2 index 0f335c61..450bdc4f 100644 --- a/jetstream_common/templates/slurm/slurm.conf.j2 +++ b/jetstream_common/templates/slurm/slurm.conf.j2 @@ -22,10 +22,10 @@ SlurmdTimeout=300 StateSaveLocation=/var/lib/slurm/slurmctld/slurm.state SwitchType=switch/none DefaultStorageLoc=/var/log/slurm/slurm_accounting -AccountingStorageType=accounting_storage/slurmdbd -AccountingStorageHost=galaxy02.tacc.utexas.edu +# AccountingStorageType=accounting_storage/slurmdbd +# AccountingStorageHost=galaxy02.tacc.utexas.edu #AccountingStoragePort=6819 -AccountingStoragePort=30001 +# AccountingStoragePort=30001 JobAcctGatherType=jobacct_gather/linux JobAcctGatherFrequency=task=15 ProctrackType=proctrack/linuxproc @@ -35,6 +35,11 @@ ReturnToService=1 # Node Configurations # # CPUs = Sockets * CoresPerSocket * ThreadsPerCore + +# Needed when setting up a controller without workers; plus this will allow +# jobs to be submitted and accepted by Slurm when there are no workers to +# take jobs. 
+NodeName=placeholder CPUs=64 State=future {% for node_type in slurm_nodes %} {% for host in groups[node_type.inventory_group] %} NodeName={{ hostvars[host]['inventory_hostname_short'] }} NodeAddr={{ hostvars[host]['ansible_host'] }} RealMemory={{ node_type.real_memory }} Sockets={{ node_type.sockets }} CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN @@ -45,7 +50,7 @@ NodeName={{ hostvars[host]['inventory_hostname_short'] }} NodeAddr={{ hostvars[h # {# well this is less than ideal, it should loop partitions, and then generate a hostlist for that partition. but this works for now... #} {% for node_type in slurm_nodes %} -PartitionName=multi State=UP MaxTime=48:20:00 MaxNodes=1 Nodes={{ slurm_hostlists.results[0].stdout }} +PartitionName=multi State=UP MaxTime=48:20:00 MaxNodes=1 Nodes={{ slurm_hostlists.results[0].stdout | default( 'placeholder') }} {% endfor %} SlurmctldDebug=7 diff --git a/jetstream_common/test_slurm.sh b/jetstream_common/test_slurm.sh new file mode 100644 index 00000000..4a81d472 --- /dev/null +++ b/jetstream_common/test_slurm.sh @@ -0,0 +1,9 @@ +#!/bin/bash +#SBATCH -n 1 +#SBATCH --partition=multi +#SBATCH -J test_job_name +#SBATCH -o /jetstream/scratch0/jobs/%N_%j.out +echo "Running a test job..." +sleep 5 +echo `ls /jetstream/scratch0/jobs/` +echo `date` diff --git a/jetstreamiuenv/README.md b/jetstreamiuenv/README.md new file mode 100644 index 00000000..007b187f --- /dev/null +++ b/jetstreamiuenv/README.md @@ -0,0 +1,162 @@ +Set up a Slurm cluster for use by Galaxy Main on the Jetstream cloud. + +The overarching architecture of this setup is that Galaxy Main uses a Slurm +cluster composed of multiple sub-clusters. Each sub-cluster runs a Slurm control +process and any number of workers to run the jobs. Sub-cluster controllers +also run Pulsar, which is used for data staging. + + +Semi-automatic scalable cluster setup +=============================== +This playbook can be used to build and semi-automatically scale a Slurm cluster. +The playbook has been tailored for use with the Galaxy Main server and the Jetstream +IU region. All the path references are relative to +*[playbook root]/jetstreamiuenv*. + +### Create the controller instance + 1. *Launch a controller instance by hand* + - Create and use a security group (SG) that allows open communication + between instances in the same group (default is *gxy-workers-sg*) + - Create another security group that will allow the controller instance to be + accessed from the outside (default is *gxy-sg*) + - The name of the instance needs to match the *controller_name* variable in + *group_vars/all.yml* (default is *jetstream-iu-slurm-controller*) + - The image to use is CentOS 7 (e.g., *736e206d-9c2c-4369-88db-8c3293bd2ad7*) + - Use a public key that is included in *secret_group_vars/controllers.yml* + (otherwise, you’ll be locked out of the instance after the play runs) + - Create a new volume and attach it to the instance (defaults to device + */dev/sdb*) + - (there is a `launch_slurm_controller.py` script to use for the launch) + 2. Make playbook updates + - Update *inventory* to include the controller's public IP + - Update the *controller_ip* variable in *group_vars/all.yml* to the + instance's private IP + - Update the *jetstream_nfs_filesystems* variable in + *group_vars/galaxynodes.yml* to point to the controller’s private IP + - Make sure *slurm-drmaa* is commented out in *group_vars/controllers.yml* + 3.
Run the playbook: + - `ansible-playbook -i jetstreamiuenv/inventory jetstreamiuenv/playbook.yml --ask-vault --limit=controllers` + - The playbook will automatically copy itself onto the controller (to + `/opt/slurm_cloud_provision/`) in preparation for configuring worker nodes + +If integrating with the SlurmScale library, you should be all set now; the playbook +installed the library as well. If not using SlurmScale, keep going. + +### Launch worker instance(s) +Worker instances get manually launched and then configured by running this +playbook from the controller instance. + + 1. Manually launch the workers + - Make sure they are in the same SG and network as the controller + - Use the key pair named *elasticity_kp* (it's contained in the playbook vault) + - Instance names need to have the *jetstream-iu-large* prefix (defined in + *group_vars/slurmclients.yml*) and be numbered consecutively starting at 0 + +### Configure the new worker(s) and reconfigure the cluster +These steps are to be performed on the controller instance, all as user *root*. + + 1. Update the playbook's inventory to include the workers' info + - Update *galaxynodes* to include the worker nodes' private IP addresses + - Set `jetstream-iu0.galaxyproject.org ansible_connection=local` + 2. Run the playbook + - Activate the virtualenv from `/opt/slurm_cloud_provision/bin` + - Run the playbook with `ansible-playbook -i jetstreamiuenv/inventory jetstreamiuenv/playbook.yml` + +After this runs, the cluster should show all the worker nodes and be +configured to run jobs in the `multi` partition. See the *Verify it works* section +below for a sample job script. + + +Elastic scaling cluster setup +============================= +This playbook can be used to build an elastic Slurm cluster that, based on the +job load, acquires and subsequently releases worker instances. The playbook has +been tailored for use with the Galaxy Main server and the Jetstream IU region. + +### Set up the cloud account +Before the playbook can be run, it is necessary to create a suitable cloud +environment for it. Start by importing the public portion of the key pair +`files/elasticity/elasticity_kp.pub` from this repo into your account. Next, +create a security group called `gxy-workers-sg` with a rule to allow all +communication amongst instances belonging to the same security group. Also +create a network called `gxy-slurm-net`; add a subnet and attach it to the +public network via a router. Finally, create an empty volume that will be used +as an NFS-shared cluster file system. The names of all these resources can be found +and changed in `launch.yml`. + +### Launch a controller instance +We'll next launch an instance that will serve as the controller for this cluster. This instance needs to belong to the `gxy-workers-sg` security group. You probably +want to create another security group that allows SSH connections and associate the instance +with it as well. You can reuse the `elasticity_kp` key pair, but note that by +default the playbook will overwrite the `authorized_keys` file on the instance with +the set of keys available in `secret_group_vars/controllers.yml` for the `centos` user, +so make sure your key is included in that file or you will get locked +out of the instance. You can override this setting by defining a key via the +`authorized_key_users` variable. Name the instance `jetstream-iu-slurm-controller`. +After launching the instance, associate a public IP address with it +and attach the previously created volume (as device `/dev/sdb`).
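The cloud setup and controller launch described above can also be scripted with the OpenStack CLI. The following is a minimal sketch, not part of the playbook: the resource names follow the defaults used in this repo (`elasticity_kp`, `gxy-workers-sg`, `gxy-slurm-net`, `jetstream-iu-slurm-controller`), while the image, flavor, subnet range, external network name, volume size, and floating IP are placeholders to adjust for your allocation.
```
# Import the public half of the elasticity key pair shipped in this repo
openstack keypair create --public-key jetstream_common/files/elasticity/elasticity_kp.pub elasticity_kp

# Security group allowing open communication between its members
# (shown for TCP; repeat the rule for UDP/ICMP as needed)
openstack security group create gxy-workers-sg
openstack security group rule create --protocol tcp --dst-port 1:65535 \
    --remote-group gxy-workers-sg gxy-workers-sg

# Network and subnet, routed to the external network
openstack network create gxy-slurm-net
openstack subnet create --network gxy-slurm-net --subnet-range 10.0.0.0/24 gxy-slurm-subnet
openstack router create gxy-router
openstack router set --external-gateway public gxy-router   # external network name varies
openstack router add subnet gxy-router gxy-slurm-subnet

# Empty volume that the controller will share over NFS
openstack volume create --size 100 gxy-scratch0

# Controller instance; attach the volume and a public IP afterwards
openstack server create --image "<CentOS 7 image>" --flavor m1.large \
    --key-name elasticity_kp --security-group gxy-workers-sg \
    --network gxy-slurm-net jetstream-iu-slurm-controller
openstack server add volume --device /dev/sdb jetstream-iu-slurm-controller gxy-scratch0
openstack floating ip create public
openstack server add floating ip jetstream-iu-slurm-controller 149.165.XXX.XXX
```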
+ +### Run the playbook +Before actually running the playbook, edit `group_vars/all.yml` and +`group_vars/galaxynodes.yml` to update the controller instance's IP address +entries with its private IP address. Also, update the `inventory` file to include the public IP of the instance in the `jetstream-iu0.galaxyproject.org` entry: +``` +jetstream-iu0.galaxyproject.org ansible_ssh_host="149.165.XXX.XXX" ansible_ssh_user="centos" ansible_ssh_private_key_file="kp" ansible_ssh_common_args='-o StrictHostKeyChecking=no -o CheckHostIP=no -o "UserKnownHostsFile /dev/null"' +``` + +Elastic scaling of worker nodes is handled by an Ansible playbook that Slurm invokes (to do so, Slurm calls a script available in `files/slurm/launch`). For that, Ansible needs to know the credentials for the target cloud; these need to be made available in a file called `clouds.yaml` in the root directory of this repo. The contents of the file should look like the following: +``` +clouds: + jetstream_iu: + auth: + username: 'username' + password: 'pwd' + project_name: 'project_name' + user_domain_name: 'tacc' + project_domain_name: 'tacc' + auth_url: 'https://jblb.jetstream-cloud.org:35357/v3' +``` + + Then, run the playbook with: +``` +ansible-playbook -i jetstreamiuenv/inventory jetstreamiuenv/playbook.yml --limit=controllers +``` + +On average, the playbook takes 4-5 minutes to run. + +### Verify it works +To verify that the controller instance and the elastic scaling work, ssh to the instance and run a test job. A sample job script is included in the repo as `test_slurm.sh`. If using this job script, first create a directory for its output on the NFS file system: +``` +sudo mkdir /jetstream/scratch0/jobs +sudo chown centos:centos /jetstream/scratch0/jobs +``` +As the `centos` user, submit the job script with +``` +sbatch /opt/slurm_cloud_provision/infrastructure-playbook/jetstream_common/test_slurm.sh +``` +In a couple of minutes, the job output should be available in `/jetstream/scratch0/jobs`. + +The log file for the Slurm controller process is `/var/log/slurm/slurmctld.log`. Logs for the elastic launch/terminate process are available in `/var/log/slurm/launch/`. + +### Elasticity config options +A few configuration options control the cluster's elasticity. The instance type to be launched is supplied in `group_vars/all.yml` as `worker_instance_type`; `worker_image_id` can also be updated there. The amount of time Slurm will keep an idle instance around can be defined in `templates/slurm/slurm.conf.elastic.j2` under `SuspendTime` (value is in seconds). For other `slurm.conf` options, see the [*slurm.conf* docs](http://slurm.schedmd.com/slurm.conf.html). + +### Issues +Although on the surface this setup appears to work fine, in practice there are three issues that have not been resolved: + +**1. Zombie processes**: at the end of a cluster scaling operation, six Ansible +threads will remain active on the controller. These will seemingly never exit as +they await some pid file. Running the scaling script by hand or via *systemd* +did not exhibit this issue. However, we were never able to figure out what was +causing this or how to resolve it. + +**2. Lingering instances**: when an instance crashes or is terminated by means +other than via Slurm's *SuspendProgram*, it is impossible to remove it from +Slurm's internal instance table. A [post on the Slurm mailing +list](https://groups.google.com/forum/#!topic/slurm-devel/QrVL4_Qc3uA) did not +get any replies. + +**3.
Instance acquisition rate**: despite setting Slurm's *ResumeRate* to 1 +[worker node per minute], the number of instance startup requests would exceed +Slurm's ability to manage the requests, resulting in *power_save programs getting +backlogged* error messages. diff --git a/jetstreamiuenv/group_vars/all.yml b/jetstreamiuenv/group_vars/all.yml index deed1fb0..e802a7dc 100644 --- a/jetstreamiuenv/group_vars/all.yml +++ b/jetstreamiuenv/group_vars/all.yml @@ -1,5 +1,9 @@ --- +worker_instance_type: m1.large +# CentOS-7-x86_64-GenericCloud-1607 +worker_image_id: 1790e5c8-315a-4b9b-8b1f-46e47330d3cc + all_groups: - name: G-803372 gid: 803372 @@ -22,5 +26,10 @@ all_users: home: /home/g2main shell: /bin/bash -controller_name: jetstream-iu0 -controller_ip: 10.0.0.10 +controller_name: jetstream-iu-slurm-controller +controller_ip: 10.0.0.11 + +jetstream_nfs_filesystems: + - device: "10.0.0.11:/scratch0" + dir: scratch0 + mountpoint: /jetstream/iu-scratch0 diff --git a/jetstreamiuenv/group_vars/controllers.yml b/jetstreamiuenv/group_vars/controllers.yml index c0013be6..d392e84a 100644 --- a/jetstreamiuenv/group_vars/controllers.yml +++ b/jetstreamiuenv/group_vars/controllers.yml @@ -10,7 +10,8 @@ group_packages: - libcurl-devel - nss-devel - openssl-devel - - slurm-drmaa + # FIXME: slurm-drmaa can't be installed on first playbook run because the depot repo is set up by the slurm role + # - slurm-drmaa group_files: - src: files/etc/dhclient.conf diff --git a/jetstreamiuenv/group_vars/galaxynodes.yml b/jetstreamiuenv/group_vars/galaxynodes.yml index ee62a753..889a5500 100644 --- a/jetstreamiuenv/group_vars/galaxynodes.yml +++ b/jetstreamiuenv/group_vars/galaxynodes.yml @@ -31,7 +31,10 @@ links: src: /cvmfs/data.galaxyproject.org/byhand/location force: yes -jetstream_nfs_filesystems: - - device: "10.0.0.10:/scratch0" - dir: scratch0 - mountpoint: /jetstream/iu-scratch0 +# Not being read by Ansible v2.2+ saying variable undefined when calling +# 'Remove static mounts' although the host group `jetstreamnfsclients` is a +# parent of `galaxynodes` server group in the inventory? Moved to all.yml. +# jetstream_nfs_filesystems: +# - device: "10.0.0.11:/scratch0" +# dir: scratch0 +# mountpoint: /jetstream/iu-scratch0 diff --git a/jetstreamiuenv/group_vars/slurmclients.yml b/jetstreamiuenv/group_vars/slurmclients.yml index 7e629634..b20b5ae7 100644 --- a/jetstreamiuenv/group_vars/slurmclients.yml +++ b/jetstreamiuenv/group_vars/slurmclients.yml @@ -13,7 +13,8 @@ group_users: home: /var/lib/slurm shell: /bin/bash -slurm_yum_repo_baseurl: https://depot.galaxyproject.org/yum/el/$releasever/$basearch +slurm_yum_repo_baseurl: https://s3.amazonaws.com/gxy-yum/el/$releasever/$basearch +# slurm_yum_repo_baseurl: https://depot.galaxyproject.org/yum/el/$releasever/$basearch slurmd_spool_dir: /var/lib/slurm/slurmd/slurmd.spool slurmctld_state_dir: /var/lib/slurm/slurmctld/slurm.state @@ -21,7 +22,7 @@ slurmctld_state_dir: /var/lib/slurm/slurmctld/slurm.state slurm_cluster_name: jetstream-iu slurm_nodes: - - real_memory: 29997 + - real_memory: 29996 sockets: 10 partition: large inventory_group: jetstream-iu-large diff --git a/jetstreamiuenv/inventory b/jetstreamiuenv/inventory index 4e3c6af1..c5b9794f 100644 --- a/jetstreamiuenv/inventory +++ b/jetstreamiuenv/inventory @@ -1,6 +1,6 @@ # looking for cvmfs1-iu0?
It's in the galaxyenv - -jetstream-iu0.galaxyproject.org +jetstream-iu0.galaxyproject.org ansible_host="149.165.172.156" ansible_user="centos" ansible_ssh_private_key_file="~/.ssh/enis_afgan_galaxy_rsa" ansible_ssh_common_args='-o StrictHostKeyChecking=no -o CheckHostIP=no -o "UserKnownHostsFile /dev/null"' +# jetstream-iu0.galaxyproject.org ansible_connection=local # Uncomment on the controller [baseenv] jetstream-iu0.galaxyproject.org @@ -8,7 +8,7 @@ jetstream-iu0.galaxyproject.org [baseenv:children] galaxynodes -# "contoller" node(s) for this cloud (not necessarily a slurm controller) +# "controller" node(s) for this cloud (not necessarily a slurm controller) [controllers] jetstream-iu0.galaxyproject.org @@ -21,8 +21,8 @@ jetstream-iu0.galaxyproject.org [slurmclients:children] galaxynodes -;[slurmelasticservers] -;jetstream-iu0.galaxyproject.org +[slurmelasticservers] +jetstream-iu0.galaxyproject.org [cvmfsclients] [cvmfsclients:children] @@ -33,7 +33,7 @@ controllers [jetstreamnfsclients:children] galaxynodes -[surmexechosts] +[slurmexechosts] [slurmexechosts:children] galaxynodes @@ -42,11 +42,12 @@ galaxynodes jetstream-iu-large [jetstream-iu-large] -jetstream-iu-large0 ansible_host=10.0.0.20 -jetstream-iu-large1 ansible_host=10.0.0.21 -jetstream-iu-large2 ansible_host=10.0.0.22 -jetstream-iu-large3 ansible_host=10.0.0.23 -jetstream-iu-large4 ansible_host=10.0.0.66 -jetstream-iu-large5 ansible_host=10.0.0.67 -jetstream-iu-large6 ansible_host=10.0.0.68 -jetstream-iu-large7 ansible_host=10.0.0.69 +# jetstream-iu-large0 ansible_host=10.0.0.68 +# jetstream-iu-large1 ansible_host=10.0.0.67 +#jetstream-iu-large1 ansible_host=10.0.0.21 +#jetstream-iu-large2 ansible_host=10.0.0.22 +#jetstream-iu-large3 ansible_host=10.0.0.23 +#jetstream-iu-large4 ansible_host=10.0.0.66 +#jetstream-iu-large5 ansible_host=10.0.0.67 +#jetstream-iu-large6 ansible_host=10.0.0.68 +#jetstream-iu-large7 ansible_host=10.0.0.69 diff --git a/jetstreamiuenv/launch.yml b/jetstreamiuenv/launch.yml new file mode 120000 index 00000000..80cee732 --- /dev/null +++ b/jetstreamiuenv/launch.yml @@ -0,0 +1 @@ +../jetstream_common/launch.yml \ No newline at end of file diff --git a/jetstreamiuenv/terminate.yml b/jetstreamiuenv/terminate.yml new file mode 120000 index 00000000..478a873e --- /dev/null +++ b/jetstreamiuenv/terminate.yml @@ -0,0 +1 @@ +../jetstream_common/terminate.yml \ No newline at end of file diff --git a/jetstreamtaccenv/group_vars/slurmclients.yml b/jetstreamtaccenv/group_vars/slurmclients.yml index c37d8d7f..21891537 100644 --- a/jetstreamtaccenv/group_vars/slurmclients.yml +++ b/jetstreamtaccenv/group_vars/slurmclients.yml @@ -21,7 +21,7 @@ slurmctld_state_dir: /var/lib/slurm/slurmctld/slurm.state slurm_cluster_name: jetstream-tacc slurm_nodes: - - real_memory: 29997 + - real_memory: 29996 sockets: 10 partition: large inventory_group: jetstream-tacc-large diff --git a/roles/slurm/tasks/elastic.yml b/roles/slurm/tasks/elastic.yml index 1ba85f2d..55df21a3 100644 --- a/roles/slurm/tasks/elastic.yml +++ b/roles/slurm/tasks/elastic.yml @@ -2,13 +2,83 @@ - name: Install virtualenv (yum) yum: - pkg: python-virtualenv + pkg: "{{ item }}" when: ansible_os_family == "RedHat" + with_items: + - python-virtualenv + - ipython - name: Create virtualenv for elastic components pip: name: "{{ item }}" virtualenv: "/opt/slurm_cloud_provision" + state: latest + with_items: + - pip + - setuptools + - cython + +- name: Install specific Ansible version in the venv + pip: + name: ansible + virtualenv: "/opt/slurm_cloud_provision" + 
version: 2.1.5.0 + +- name: Download Pyslurm + git: + repo: https://github.com/PySlurm/pyslurm + version: 16.05.5 + dest: /opt/pyslurm + +- name: Install Pyslurm + shell: "{{ item }}" + args: + chdir: /opt/pyslurm with_items: - - shade - - ansible + - /opt/slurm_cloud_provision/bin/python setup.py build + - /opt/slurm_cloud_provision/bin/python setup.py install + +- name: Download SlurmScale + git: + repo: https://github.com/afgane/slurmscale + dest: /opt/slurm_cloud_provision/slurmscale + ignore_errors: yes + +- name: Install SlurmScale + shell: /opt/slurm_cloud_provision/bin/python setup.py install + args: + chdir: /opt/slurm_cloud_provision/slurmscale + +- name: Copy SlurmScale config file + copy: + remote_src: True + src: /opt/slurm_cloud_provision/slurmscale/slurmscale.ini.sample + dest: /opt/slurm_cloud_provision/slurmscale/slurmscale.ini + +- name: Copy this playbook + synchronize: # sync is much faster than copy but requires chown task below + src: "{{ playbook_dir }}/../" + dest: /opt/slurm_cloud_provision/infrastructure-playbook/ + +- name: Place vault pass + copy: + src: "../jetstream_common/files/elasticity/vp" + dest: /root/.vault_pass + mode: "0600" + +- name: Place CloudBridge config + copy: + src: "../jetstream_common/files/elasticity/cloudbridge" + dest: /root/.cloudbridge + mode: "0600" + +- name: Ensure root's .ssh dir exists + file: + path: /root/.ssh + state: directory + +- name: Place PK + copy: + src: "../jetstream_common/files/elasticity/elasticity_kp" + dest: /root/.ssh/id_rsa + mode: "0600" diff --git a/roles/slurm/tasks/main.yml b/roles/slurm/tasks/main.yml index 25bec879..9cb9bad6 100644 --- a/roles/slurm/tasks/main.yml +++ b/roles/slurm/tasks/main.yml @@ -38,13 +38,17 @@ #- slurm-torque when: ansible_os_family == "RedHat" -# FIXME: this task will fail if slurmservers[0] has not already completed the slurm.conf task that follows it +- name: Check if any workers are defined locally + debug: + msg: '{{ groups[item.inventory_group] | length > 0 }}' + with_items: "{{ slurm_nodes }}" + register: local_workers + - name: Acquire hostlist command: scontrol show hostlist {{ groups[item.inventory_group] | join(",") }} - with_items: slurm_nodes - delegate_to: "{{ groups['slurmservers'][0] }}" - run_once: true + with_items: "{{slurm_nodes}}" register: slurm_hostlists + when: local_workers.results[0].msg == True and 'slurmexechosts' not in group_names - name: Install slurm.conf template: @@ -96,7 +100,7 @@ group: slurm mode: 0755 state: directory - when: "'slurmservers' in group_names" + when: "'slurmservers' in group_names or 'slurmexechosts' in group_names" - name: Check munge dir file:
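Both the `files/slurm/launch`/`terminate` scripts and the reworked "Acquire hostlist" task in `roles/slurm/tasks/main.yml` rely on Slurm's compact hostlist syntax. A quick illustration of the round-trip they use, with example node names following the inventory's *jetstream-iu-large* naming:
```
# Compress a comma-separated node list into Slurm's bracketed hostlist form,
# as done by the "Acquire hostlist" task
$ scontrol show hostlist jetstream-iu-large0,jetstream-iu-large1,jetstream-iu-large2
jetstream-iu-large[0-2]

# Expand a hostlist back into individual names; this is what launch/terminate
# do with "$1" before passing the result to ansible-playbook via --extra-vars
$ scontrol show hostnames "jetstream-iu-large[0-2]" | tr '\n' ',' | sed 's/,$//'
jetstream-iu-large0,jetstream-iu-large1,jetstream-iu-large2
```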