diff --git a/ansible.cfg b/ansible.cfg index 4516e209..f4d8de4d 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -1,4 +1,6 @@ [defaults] +host_key_checking = False +callback_whitelist = profile_tasks # include roles from galaxyproject/ansible-common-roles roles_path = roles:common_roles @@ -17,3 +19,6 @@ pipelining = True # These are necessary for cloud instances #pipelining = False #ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no + +[paramiko_connection] +record_host_keys = False diff --git a/jetstream_common/files/elasticity/elasticity_kp b/jetstream_common/files/elasticity/elasticity_kp new file mode 100644 index 00000000..1620a97e --- /dev/null +++ b/jetstream_common/files/elasticity/elasticity_kp @@ -0,0 +1,90 @@ +$ANSIBLE_VAULT;1.1;AES256 +64646363613565643537363537623663373961326364366636343061323666396235326430623538 +6263373731333466383561386139653035313362326237660a366639333261643337643332666164 +32326564623561663435666162343930396634656130313039363930353664666336323839356339 +3566623532316233630a333936353261643865633236323930363132613433366236663638666535 +65376430393663326237626239373365363263316664353163653664363932383134633539303764 +38363132616265633134656362616466396635383534336133333333613566666638303936643566 +65393431656337323965623564613366346130663233663761306662373466313364363366393561 +32393964643664386132313139366630326535323137636235393234383861376632613339613961 +39376231303465383131333035383262383032333031303963616336363738616435313534363561 +62336532633861303438396263663339653436653566633436353232326532633237343566346338 +61633837623235653465623635363338643662616265393935373532653237616366613637633631 +35636237633433623863393766396430386264623631656336663661633037306262376666386130 +37343938393766643534333662633436386635343262333263343236356639333035396138656166 +63623334373930356139653664623633316437616233366339623039386633373438386530373531 
+33633265353030376162363938333038343432626565363463643137353635613933313433383561 +33623938623530643062656266336630366131383839636562636364303364323366653837663935 +39366532353635323137353631383132323435616636366636653965343637616164653236323662 +38643537643262653032393963303861326239303064396363376262346439346264356435316561 +39666461303234663136363264343962356539333632343336646637633533303638373536313064 +32353532396162356535353634333236646162633463616335653665333833323331396264346362 +36633332396164316663353439343737316631376666633862366232353535623135303232323838 +36373164303062613030336363306336343762313534626265663064633632303131373730333532 +36343439646238303531633862363361663431623934396566393733353838666564316162623730 +66313263313037376635393937333266636561636634663136623661626530316164633536646339 +38653937383866303065313637366138623933336236636138383236333439616463343863623139 +66396336353439656237313336636462366364653038316662353465336130383433633363313633 +62353264373835393265323131363137353139313833373335363366373636346639313538653436 +63306535633665303735323732373266663864326666666239393134636530323061636466393530 +32366630376661613637626362396434623332613561383838356264623863383561303564333936 +30383834303661366536313038323864383438396438373661656236363566366334343237306665 +65343537626639386161373830643866336639383765353363363465643732333366623562356262 +30636535306630373165363639316363643835313765313562343835396461333733613130383865 +30313866666633373432626361393633616534356132663165356332326635336438316463303035 +62653439663534393336363635613163383630363565366562616331646164313139653333376633 +34333062653436383463346561343934333865386436666265366537613464613530383532323231 +35363638343466656630333334633432653164393936346139633233646334653631386131356537 +37616165353965633333353031666166363737363366616261373532666136363330353263623362 +35636636306431316336623233383966633939636264653432643865393463383130386134323736 
+63623663623230346266343839386132303631343036333334393135643464393837326234623238 +33306330313237386362623265376232343630653535313532376563373031616635326635653336 +63303232623166663866313539333337323839643934366363333733636437623163366233383062 +34306133373238333963626333363830626261333235376465373661643532336530623431313731 +61383634376437373135353831376364306639646565346234386265656333396337646233643937 +65343336356531633534353033663135666136653565303764346264326437643331643331623939 +37313336616438323538623765303635303637313263636131353965316434653837306466393836 +61373464306133613164383236333230363139643061623338363965333230613963356230313939 +37396131326634613164393231636635303633373461376134323630383336323961663137636264 +62643463643132363634353164383232613866356630366536393039653465366230636234386566 +39393032396462663634353739336664646561396361633431336638316266303837313633363263 +37656235623462646166643132326462373930656465646436346333343234363339656136373431 +33306466613861633632643063656266313762333931383335636137386165643237396564356635 +61383934386239616638623333626532373966623163373466326466643964363733666134656562 +35653963613064383336363338323766646436663932376131653664313964366139313130643733 +61326131613336356430346432356166653466333534333136356533613939376333613237616362 +64626638396263356138343132333633613261386430313932653536303565663535336631333830 +36653436616539363065383335366435626138633037383832366434316530613938383138303964 +38306334346638376639313763383934633931656530393761373833396565333566656130356662 +30636431663365616532383764386338356137643736356431393536346635313265616338633630 +61386432343630333734353834316664326431363335626365663335346239313633363135643563 +64646135363630396630616161326334393439613431636231353831613061323963353434643136 +37323865343361306165343534323831613234383432303632333039643835336664396564306161 +62366331653061363931313634333131366531613464383065316362393964323030643432623666 
+30386634646633376236643062393032643265376536303263313562653165643531613066396363 +38666138316532626434303161316262646230373164656639306264303430323461643536653033 +38333034356464336538653130323036646235376437346436383339333963633438343965636132 +33316465356634343633363131663533336139343134323265633061326431653137343261653232 +34666162626262373032316633643339376334613836656464623236653938343535623962396539 +36653034343930393163663431393732306264316662316230663564336361653665356566616165 +62646530623561613466303362383562663065346533306338633065613437666236623161326530 +66656535356635646661313966386437653366613661656138366664373162336333373932613765 +30323861326531333735626537393036633931343962663031663561333336333634333938323134 +31303630306137626130323731616432353635343365333666383639623064376264346536353738 +61656337313632383564336439363536636430396335653234376662393465653832363436366635 +65373536376631353633336230353236343666366636666236386264343136623963376539323431 +38656332353933623631316338393939313766636537333564346234613238613065626566613366 +63646533326136326462623439313232616534646139373961386439616563333164656565626137 +37653538636430663265356364363763626532386262353861666266323330303965323939336465 +33303931353830333565666163373766353265653464616462623165616335663764316237356335 +31393165323331356136306662393462336562353635323734313963626337393662323932666361 +30656664353465623031623631343264613662316266363364373634313966643839303136386564 +62643862326566383333326462646131626461316238383561653531333062663333653134326463 +33313566653963393330363337303435353531656461663436663161336631326533633834346536 +39323464363930636631323163653333623565613134393366656336633664613563623339366531 +61353436623466316664373135623334396432626163353065646536613331363865393436333339 +35383132383731373539373632366633643063636664316630383935376262636561376338336639 +62326636393664373736393635663833663530376633633431613765313430393162306431363463 
+64383435386262363764663434633264333666383335356361643836383766353933643930336132 +66323139363630343463623765626138666531613431343733383331366132323234396237376535 +3562 diff --git a/jetstream_common/files/elasticity/elasticity_kp.pub b/jetstream_common/files/elasticity/elasticity_kp.pub new file mode 100644 index 00000000..92b4fb7d --- /dev/null +++ b/jetstream_common/files/elasticity/elasticity_kp.pub @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCpw4D9NR0at80ehc84kTchP2aDCV9HbZetptVCwc2/sdzUHvecUXNRc61RYKttF3kdgWO8UhGWjUUjNCMOR/gYxV1Ui1hpihFExdg2tHbXpkArdrHJK6n+QWf7qYrNTLFVFH2XVxouzJ4k37slGNeQMWczRHS2ZUL+cbfYOEWe8RhWRqhRjvAdxLnQt1dg/sKc0MqeDwEVAmdgNHUozbeUxKVVsoxWBeUiLK7xTd0PQ/jXRVWY1pYQ2xfjvhm1DiylQO/5fox8Z5MmyqBSAsKdGgfe47K335QmmrEKtq/6O1AAnm6D0Pkqky3EgzT7g/DpT2n8VyCWAdFIvrQAg65V Generated-by-Nova diff --git a/jetstream_common/files/elasticity/vp b/jetstream_common/files/elasticity/vp new file mode 100644 index 00000000..b82b0bc4 --- /dev/null +++ b/jetstream_common/files/elasticity/vp @@ -0,0 +1,7 @@ +$ANSIBLE_VAULT;1.1;AES256 +39343936396638626438353764366237336637336537383966393234303534393730303635653564 +3837353262653931633761336561623666653464303032630a323933393438376438663332666238 +34333133326533313136363935373837376538636230653239353430356330633037393766373763 +3031376537643338370a333939656563646539356563306666306131303638666666633036383161 +34636336623039376638633430353138363031366230633833633234616238613165303534336464 +3634326232303961313037333636646562393732666334653033 diff --git a/jetstream_common/files/slurm/launch b/jetstream_common/files/slurm/launch index 68f7f223..5c73d26e 100755 --- a/jetstream_common/files/slurm/launch +++ b/jetstream_common/files/slurm/launch @@ -9,7 +9,7 @@ LOG_DIR=/var/log/slurm/launch set -xv VENV=/opt/slurm_cloud_provision -PLAYBOOK=/home/centos/infrastructure-playbook +PLAYBOOK=/opt/slurm_cloud_provision/infrastructure-playbook : ${HOME:=/var/lib/slurm} export HOME @@ -17,5 +17,5 @@ export HOME . 
$VENV/bin/activate cd $PLAYBOOK instances=`scontrol show hostnames "$1" | tr '\n' ',' | sed 's/,$//'` -ansible-playbook -i jetstreamenv/instance_inventory jetstreamenv/launch.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_launch=$instances" +ansible-playbook -i jetstreamiuenv/instance_inventory jetstreamiuenv/launch.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_launch=$instances" ) 2>&1 | tee "$LOG_DIR/launch.`date +%s`.${1}.log" diff --git a/jetstream_common/files/slurm/terminate b/jetstream_common/files/slurm/terminate index 1bb954c1..4b78dc79 100755 --- a/jetstream_common/files/slurm/terminate +++ b/jetstream_common/files/slurm/terminate @@ -9,7 +9,7 @@ LOG_DIR=/var/log/slurm/launch set -xv VENV=/opt/slurm_cloud_provision -PLAYBOOK=/home/centos/infrastructure-playbook +PLAYBOOK=/opt/slurm_cloud_provision/infrastructure-playbook : ${HOME:=/var/lib/slurm} export HOME @@ -17,6 +17,6 @@ export HOME . $VENV/bin/activate cd $PLAYBOOK instances=`scontrol show hostnames "$1" | tr '\n' ',' | sed 's/,$//'` -ansible-playbook -i jetstreamenv/instance_inventory jetstreamenv/terminate.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_terminate=$instances" +ansible-playbook -i jetstreamiuenv/instance_inventory jetstreamiuenv/terminate.yml --vault=/var/lib/slurm/.vault-pass --extra-vars="jetstream_instances_to_terminate=$instances" ) 2>&1 | tee "$LOG_DIR/terminate`date +%s`.${1}.log" true diff --git a/jetstream_common/launch.yml b/jetstream_common/launch.yml index ad990ec2..e128d4b4 100644 --- a/jetstream_common/launch.yml +++ b/jetstream_common/launch.yml @@ -8,14 +8,14 @@ tasks: - name: Launch new instance(s) os_server: - cloud: jetstream_iu + cloud: jetstream_iu # Details defined in clouds.yaml, including auth name: "{{ item }}" - image: "CentOS 7 Stock 1601" - flavor: "m1.large" - key_name: "slurm_jetstream0" - security_groups: "default,usegalaxy-control" + image: "{{ worker_image_id }}" + 
flavor: "{{ worker_instance_type }}" + key_name: "elasticity_kp" + security_groups: "gxy-workers-sg" nics: - - net-name: "usegalaxy" + - net-name: "gxy-slurm-net" auto_ip: no with_items: "{{ jetstream_instances_to_launch.split(',') }}" register: jetstream_instances_launched @@ -29,7 +29,8 @@ add_host: name: "{{ item.server.name }}" ansible_host: "{{ item.server.private_v4 }}" - groups: "baseenv,galaxynodes,slurmclients,slurmexechosts" + groups: + "baseenv,galaxynodes,slurmclients,slurmexechosts,jetstreamnfsclients" with_items: "{{ jetstream_instances_launched.results }}" - name: Spin waiting for instance(s) to become accessible diff --git a/jetstream_common/playbook.yml b/jetstream_common/playbook.yml index 5ee720e7..110a57c7 100644 --- a/jetstream_common/playbook.yml +++ b/jetstream_common/playbook.yml @@ -5,6 +5,7 @@ remote_user: centos become: yes become_method: sudo + connection: paramiko pre_tasks: - name: Locate secret group variable files local_action: @@ -65,7 +66,7 @@ dest: /etc/hosts - name: Install supervisor - hosts: all + hosts: controllers remote_user: centos become: yes become_method: sudo diff --git a/jetstream_common/secret_group_vars/clouds.yaml b/jetstream_common/secret_group_vars/clouds.yaml new file mode 100644 index 00000000..b4840d3a --- /dev/null +++ b/jetstream_common/secret_group_vars/clouds.yaml @@ -0,0 +1,18 @@ +$ANSIBLE_VAULT;1.1;AES256 +39376264656237656561343363313364633637376535386130343039643136636636613964333961 +3231323431356661333264646533343630633437376531640a313961613938343931653861316431 +38313665363931633832613133663438323933663135386262613732303863356464326262356134 +6636633333363938330a343964636437363961613162323635393033616633323838383835616565 +38633030373232636233376132653462613664343436313934653566386332376436376461333063 +35333030353737366530303635643861333166353736353039386130316662376439633239626534 +66636636346232303263656339306135626238633461643832373933343762653839613434636134 
+35343939613332653431346664633737373363613537323734323637653466663039313136353734 +32313930376332363733383462656336323635616139366364663236393663316534656466323532 +38346430613236326238313463363262643836643533643331316331653665646430343637303566 +65323536373864623538336336383938363865626432333939353766666433646433663239363830 +65336430666636656335383065356638653534663865383639323562306232353932626166636261 +39353933323365326133623264653332373533396638326330623161666637613936306465666162 +39623136373339383261326366376231666437666362613464326333396332326266323934626232 +31363466653636326362393532666139633039393533343130373864313364326562333238336435 +38333031326231656334323065663366616133623637306434323461393633613064383133653731 +3738 diff --git a/jetstream_common/secret_group_vars/controllers.yml b/jetstream_common/secret_group_vars/controllers.yml index 2312f6fd..d7ad55bb 100644 --- a/jetstream_common/secret_group_vars/controllers.yml +++ b/jetstream_common/secret_group_vars/controllers.yml @@ -1,53 +1,54 @@ $ANSIBLE_VAULT;1.1;AES256 -62393064353039356130653437363630333866386262613034353837393530383164663666613936 -3237386139643131626339663030623436656663653630660a363764326364376432306363613432 -39356266356430663264343561333038626535313738663437326135653032626333393965396130 -3031636164646363610a633533353335396239656164346235366165363161646565336433353438 -65326339616438613839613735336535356163393639303531366230323566353362623664626165 -37363666633461396534346166373662316565386565656634633037643765643630386533393333 -61646163646235363362306138306565366438623431663735313938373438316633333637313136 -62363566666362356262326530393531326336326532343437333861326531396531393331633635 -61626135663065363264333034386632653930643339316235326462363438323464373337393437 -34383734353438373563663261366564306336323564313335653737636430393032613235663136 -34343336303135346338306232633263633532333739653733343036316330633165346639323936 
-61366666366438396335656631643837333966303266303036393865366138343339653431616634 -39646130646539346232323533373437396238643232376137633936613832333535383862656166 -62656233653533363337623164373334623136346639643330333836386538353363366461633136 -35613234623865646163313937346266326137333465616466393133626135323562623934636565 -65333231363964323730656561366634313762633432633962623634623836363436386663383339 -36303336323166653631633731623136633764663331366531626131656363646337393266323662 -63313836316666653061373761653032366165393064363832343237373039623564626530323530 -39366462666336343363613330653937353063313765306435353761626530393936646461636332 -32356131373937333964616139313962623635346531346133316661303262396631613538633135 -38383061353135313163643565616464333565616132326265636663363461346530653361373362 -31333836373563653331653461333533383066666463613132373561643164633032396436336166 -63373963363663323361386239323361376235616162623330613939333130613432313965303466 -64333431656239653765326365346534363737643439336230616661346466353635353662303735 -30336535636535373834366631623334363038383434303661366638356462303363313236303434 -62353835373634613531306463373632333839393038346232353063333661336131326437353835 -38653339373830646637626336613136616138356562636137623932373038373638316439386136 -37366432616239376162316263373633663266316438643036633133396639333035363138666261 -65336332663965393561393136646636333031316639343830653333633162316265323361656462 -34336265633666306361343130633437626235303464633539373561336439303835393136326337 -62393334333363353663343636646664333439343930396166363236323438366464383931346439 -65616233313366316234666164633366313962643061363235386562646532363738313465613032 -65663966643936663831663366336662363534353531343835663265623434633836383832383466 -37343732306436303131666337366231636234333064616536306661383336393830386132336261 -32336634333532313939343233336239393731636330326231613334663731663535363032373166 
-34366264306334616537336532393531643333393165353563383866373436323831363230346539 -39656464373264323734663161643033393466626338633837393837343062656332663037316131 -65373831356365623162343366343363336631663335353335396164623634353762316631313235 -32356130363337393331383531376431653137663235633033313661313965633161623530616466 -36326462363431333431303437663136653661386131383466333334343262663666386139363832 -61353933653333616130343361336130373136373466396431363061353264303061626361396466 -30313061393432303332383239363162333932376434363330363530353336623464656131643533 -65353363396661656464636338366263363235343536656565663936663037323735363933663266 -64363131343764643832393130346363386537633431623663343761356366376264333134383438 -30616666393363646438326136303235346234373266366139393136336665613265346133303836 -65343263343734353165333066393164323535346564656264383066663566306162306335643764 -66396236313135633535333231313965303866326362336536383232386661366239373030333762 -30663536656462663764323037303637663864383735306137646431353965366661663464613064 -33356232623731353964393933623235386565653436333636333036636165333039613835336538 -30323464643164333864346538333762653838393466393439393836373933623237613033626233 -33616438373131306630326431396665373734613531346633306437616230393138306665306130 -36303635616265653935 +34366663643338623731303138343366373164316639663931643233386434356163633637636434 +3131663331313366356161633838666162623536376438610a313937386634653264663938316139 +33333765383836326465303263333962346233393932623464363462653363346137336431383439 +6330346465613835350a343738373238353037653363323762666138363332636266633436303361 +33383130386233333131636266313333303366383439356462303063656661643835333239613862 +65343733343735663936393236653264353933336430343930646235333564316561643364353066 +61393838633131366333373236626337613165383062633730323861323238633764366433363837 +30396135346262336162386432333565386437326235643161396434666238326435666461656666 
+65313365336563666231383638306163323931363962393564363964333132626636396363643333 +39336366313032313630386464356433396263333163666264323834633066346265663735613761 +62316639363438383232343661636664666639376431343138633539653033333965633438613535 +62663230316637306366633837643930323862393963616430363137396531396165626335393537 +61633365653864653138366139656163373532363836326261306265376631613438653963626239 +66656430336361373335333133346432353937613638313434373134353836336634646566666365 +39663764643331626232343865306138383537393736666636333136356534396561306239636333 +36646430303964336263656432323736353437343837373931656464333362636361356264636163 +33356161373263336533356138306630363666366339636339373064333062326661656132373738 +39333930393532306638306562616135303965356136616362616263393164363639663335663364 +37393466306237383132616334373430386330336665656263616264333336663737386639353930 +66306534353336316634626638393665356563613765363062393866376461316433653866323833 +30386465666130643562626333333139303466343865393830356663303733653631643336323231 +62636536653238613563643336343430366239343033336637373066376136376233306532646636 +62363061376634333338616263383532353233363866356233363461333830613134386336316632 +61643838323936663061366363356566356639373437663338376234303837646137646163653833 +30633033343031313432656361363830343663306361623931373736633766333764316265303833 +64303839623832643536336531633262636564343039363335313965323339323964373938333039 +65363963326664633334353233666464396566356435346465633034373033666563316535666135 +65333965663737333933323263373637646138383635316230373664356664363932366535623038 +33386661366534613066356637643764393163633035613334396237343838313065333730636134 +39393135383931333533343632653632613737626265643637666130306563323430643736613839 +39333663323265633466386535623166383331326337376131303730636466623636626262623839 +34313730623033363465303833613537373234626362666363336664353736383938633338613666 
+66366632323433353236653264623863356532323862643931353238396439363835316639346666 +33353230666339366530323061386636643132363064666537656130333265653830373532353561 +63386531666532636333393333343537626130656237373339613261623563383331333434343336 +36396161323232376132363736316162646366316666396239626462663034663134613661383637 +39616264636664323563356537633566613939653831353265323137333565353630653631366234 +39616131643732306432313865376261343836386334613835326164326232343632393432613632 +62316633373832383465343932666135346237626431636632363830343962313034613365353537 +36646631323862376636383532373536383431366361343163373861656636386634616135356663 +32346332316637663535376136636138376563663131323031373231393162306333333066363934 +34333831303134376233643935373662353930383465373761646136613666663965653964616237 +38363364303532336638396462396435653565393932316361303131346536653139323631633733 +35613861393533346366636565396533316535633035363038366432626361653735393036303132 +35343635393634373362363337376136633664343432646634323563333863613433393835313334 +62323831373462666461346365613061326230666538373962376130356535363634363865396332 +34616334636663663639633966383831323361383532623239633930633065363534393063363430 +62353631313139356364626666373739383336613234373433663464623339663765623366646337 +38383934616434356465386239336233336266303961346466373035346433343063323130316530 +34343361326532356366656334303836656263643734643531633837633938313636303533333434 +37363834613666303831636332343263386138383866376663336639616232333761313465313430 +32616135373031356261333265373936666562373439636261663339653832373163633039343162 +6432 diff --git a/jetstream_common/templates/slurm/slurm.conf.elastic.j2 b/jetstream_common/templates/slurm/slurm.conf.elastic.j2 new file mode 100644 index 00000000..80270c90 --- /dev/null +++ b/jetstream_common/templates/slurm/slurm.conf.elastic.j2 @@ -0,0 +1,56 @@ +## +## This file is maintained by Ansible - CHANGES WILL BE OVERWRITTEN +## 
+ControlMachine={{ controller_name }} +ControlAddr={{ controller_ip }} +# +AuthType=auth/munge +FastSchedule=1 +JobCompLoc=/var/log/slurm/slurm.job.log +JobCompType=jobcomp/filetxt +PluginDir=/usr/lib64/slurm +SchedulerType=sched/backfill +#SelectType=select/cons_res +#SelectTypeParameters=CR_CPU_Memory +SelectType=select/linear +SlurmUser=slurm +SlurmctldPort=7002 +SlurmctldTimeout=300 +SlurmdPort=7003 +SlurmdSpoolDir=/var/lib/slurm/slurmd/slurmd.spool +SlurmdTimeout=300 +StateSaveLocation=/var/lib/slurm/slurmctld/slurm.state +SwitchType=switch/none +DefaultStorageLoc=/var/log/slurm/slurm_accounting +# AccountingStorageType=accounting_storage/slurmdbd +# AccountingStorageHost=galaxy02.tacc.utexas.edu +#AccountingStoragePort=6819 +# AccountingStoragePort=30001 +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=task=15 +ProctrackType=proctrack/linuxproc +ClusterName={{ slurm_cluster_name }} +ReturnToService=1 +# Elastic config +ResumeRate=1 +ResumeProgram=/opt/slurm_cloud_provision/bin/launch +SuspendProgram=/opt/slurm_cloud_provision/bin/terminate +ResumeTimeout=300 +SuspendTime=180 +SuspendRate=5 +TreeWidth=256 +BatchStartTimeout=780 +# +# Node Configurations +# +NodeName=jetstream-iu-elastic[1-64] State=CLOUD +# +# Partition Configurations +# +# PartitionName=normal Default=YES State=UP MaxTime=48:20:00 MaxNodes=1 Nodes=jetstream-iu-elastic[1-64] LLN=YES +PartitionName=multi State=UP MaxTime=48:20:00 MaxNodes=1 Nodes=jetstream-iu-elastic[1-64] LLN=YES + +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmctldDebug=9 +SlurmdLogFile=/var/log/slurm/slurmd.log +SlurmdDebug=debug5 diff --git a/jetstream_common/test_slurm.sh b/jetstream_common/test_slurm.sh new file mode 100644 index 00000000..e27b5049 --- /dev/null +++ b/jetstream_common/test_slurm.sh @@ -0,0 +1,9 @@ +#!/bin/bash +#SBATCH -n 1 +#SBATCH --partition=multi +#SBATCH -J test_job_name +hn=`hostname` +of="/jetstream/scratch0/jobs/sample_$hn.out" +sleep 5 +ls ~ > $of +date >> $of diff --git 
a/jetstreamiuenv/README.md b/jetstreamiuenv/README.md new file mode 100644 index 00000000..aa3b8805 --- /dev/null +++ b/jetstreamiuenv/README.md @@ -0,0 +1,84 @@ +Set up a Slurm cluster for use by Galaxy Main on the Jetstream cloud. + +The overarching architecture of this setup is that Galaxy Main uses a Slurm +cluster composed of multiple sub-clusters. Each sub-cluster runs a Slurm control +process and any number of workers to run the jobs. Sub-cluster controllers +also run Pulsar that is used for data staging. + +Elastic scaling cluster setup +============================= +This playbook can be used to build an elastic Slurm cluster that, based on the +job load, acquires and subsequently releases worker instances. The playbook has +been tailored for use with Galaxy Main server and Jetstream IU region. + +### Set up the cloud account +Before the playbook can be run, it is necessary to create a suitable cloud +environment for it. Start by importing the public portion of a key pair +`files/elasticity/elasticity_kp.pub` from this repo into your account. Next, +create a security group called `gxy-workers-sg` with a rule to allow all +communication amongst instances belonging to the same security group. Also +create a public network called `gxy-slurm-net`. Add a subnet and attach it to a +public network via a router. Finally, create an empty volume that will be used +as an NFS-shared cluster file system. Names of all these resources can be found +and changed in `launch.yml`. + +### Launch a controller instance +We'll next launch an instance that will serve as the controller for this cluster. This instance needs to belong to the `gxy-workers-sg` security group. You probably +want to create another security group that allows ssh connections and associate the instance +with it as well. 
You can reuse the `elasticity_kp` key pair but note that by +default, the playbook will overwrite the `authorized_keys` file on the instance and +place the set of keys available in `secret_group_vars/controllers.yml` for +`centos`, so make sure your key is included in the file or you will get locked +out of the instance. You can override this setting by defining a key via the +`authorized_key_users` variable. Name the instance +`jetstream-iu-slurm-controller`. After launching the instance, associate a public IP address with it +and attach the earlier created volume (as device `/dev/sdb`). + +### Run the playbook +Before running the playbook, edit `group_vars/all.yml` and +`group_vars/galaxynodes.yml` to update the IP address of the controller +instance and specify its private IP address. Also, update the `inventory` file to include the public IP of the instance under `jetstream-iu0.galaxyproject.org` variable: +``` +jetstream-iu0.galaxyproject.org ansible_ssh_host="149.165.XXX.XXX" ansible_ssh_user="centos" ansible_ssh_private_key_file="kp" ansible_ssh_common_args='-o StrictHostKeyChecking=no -o CheckHostIP=no -o "UserKnownHostsFile /dev/null"' +``` + +Elastic scaling of worker nodes is handled by an Ansible playbook that Slurm calls on (to do so, Slurm calls a script available in `files/slurm/launch`). With that, Ansible needs to know about the credentials for the target cloud; these need to be made available in a file called `clouds.yaml` in the root directory of this repo. The contents of the file should look like the following: +``` +clouds: + jetstream_iu: + auth: + username: 'username' + password: 'pwd' + project_name: 'project_name' + user_domain_name: 'tacc' + project_domain_name: 'tacc' + auth_url: 'https://jblb.jetstream-cloud.org:35357/v3' +``` + + Then, run the playbook with: +``` +ansible-playbook -i jetstreamiuenv/inventory jetstreamiuenv/playbook.yml --limit=controllers +``` + +On average, the playbook takes 4-5 minutes to run. 
+ +### Verify it works +To verify the controller instance and the elastic scaling work, ssh to the instance and run a test job. A sample job script is included in the repo as `test_slurm.sh`. If using this job script, first create a directory for its output on the NFS file system: +``` +sudo mkdir /jetstream/scratch0/jobs +sudo chown centos:centos /jetstream/scratch0/jobs +``` +As the `centos` user, submit the job script with +``` +sbatch /opt/slurm_cloud_provision/infrastructure-playbook/jetstream_common/test_slurm.sh +``` +In a couple of minutes, job output should be available in `/jetstream/scratch0/jobs`. + +The log file for the Slurm controller process is available in `/var/log/slurm/slurmctld.log`. Logs for the elastic launch/terminate process are available in `/var/log/slurm/launch/`. + +### Elasticity config options +There are some configuration options that can be changed for the cluster elasticity parameters. The instance type to be launched is supplied in `group_vars/all.yml` as `worker_instance_type`. `worker_image_id` can also be updated there. The amount of time Slurm will keep an idle instance around can be defined in `templates/slurm/slurm.conf.elastic.j2` under `SuspendTime` (value is in seconds). For other `slurm.conf` options, see [*slurm.conf* docs](http://slurm.schedmd.com/slurm.conf.html). 
+ +Static cluster setup +==================== +*TODO* diff --git a/jetstreamiuenv/group_vars/all.yml b/jetstreamiuenv/group_vars/all.yml index deed1fb0..d2f54980 100644 --- a/jetstreamiuenv/group_vars/all.yml +++ b/jetstreamiuenv/group_vars/all.yml @@ -1,5 +1,9 @@ --- +worker_instance_type: m1.tiny +# CentOS-7-x86_64-GenericCloud-1607 +worker_image_id: 1790e5c8-315a-4b9b-8b1f-46e47330d3cc + all_groups: - name: G-803372 gid: 803372 @@ -22,5 +26,5 @@ all_users: home: /home/g2main shell: /bin/bash -controller_name: jetstream-iu0 -controller_ip: 10.0.0.10 +controller_name: jetstream-iu-slurm-controller +controller_ip: 10.0.0.13 diff --git a/jetstreamiuenv/group_vars/controllers.yml b/jetstreamiuenv/group_vars/controllers.yml index 3349ec56..45debaed 100644 --- a/jetstreamiuenv/group_vars/controllers.yml +++ b/jetstreamiuenv/group_vars/controllers.yml @@ -9,7 +9,8 @@ group_packages: - libcurl-devel - nss-devel - openssl-devel - - slurm-drmaa + # FIXME: slurm-drmaa can't be installed on first playbook run because the depot repo is set up by the slurm role + # - slurm-drmaa jetstream_scratch0_device: /dev/sdb diff --git a/jetstreamiuenv/group_vars/galaxynodes.yml b/jetstreamiuenv/group_vars/galaxynodes.yml index ee62a753..be6e41b8 100644 --- a/jetstreamiuenv/group_vars/galaxynodes.yml +++ b/jetstreamiuenv/group_vars/galaxynodes.yml @@ -32,6 +32,6 @@ links: force: yes jetstream_nfs_filesystems: - - device: "10.0.0.10:/scratch0" + - device: "10.0.0.13:/scratch0" dir: scratch0 mountpoint: /jetstream/iu-scratch0 diff --git a/jetstreamiuenv/group_vars/slurmclients.yml b/jetstreamiuenv/group_vars/slurmclients.yml index 7e629634..f6c7767c 100644 --- a/jetstreamiuenv/group_vars/slurmclients.yml +++ b/jetstreamiuenv/group_vars/slurmclients.yml @@ -13,7 +13,8 @@ group_users: home: /var/lib/slurm shell: /bin/bash -slurm_yum_repo_baseurl: https://depot.galaxyproject.org/yum/el/$releasever/$basearch +slurm_yum_repo_baseurl: https://s3.amazonaws.com/gxy-yum/el/$releasever/$basearch +# 
slurm_yum_repo_baseurl: https://depot.galaxyproject.org/yum/el/$releasever/$basearch slurmd_spool_dir: /var/lib/slurm/slurmd/slurmd.spool slurmctld_state_dir: /var/lib/slurm/slurmctld/slurm.state diff --git a/jetstreamiuenv/inventory b/jetstreamiuenv/inventory index 4e3c6af1..a5cd6db1 100644 --- a/jetstreamiuenv/inventory +++ b/jetstreamiuenv/inventory @@ -1,5 +1,4 @@ # looking for cvmfs1-iu0? It's in the galaxyenv - jetstream-iu0.galaxyproject.org [baseenv] @@ -21,8 +20,8 @@ jetstream-iu0.galaxyproject.org [slurmclients:children] galaxynodes -;[slurmelasticservers] -;jetstream-iu0.galaxyproject.org +[slurmelasticservers] +jetstream-iu0.galaxyproject.org [cvmfsclients] [cvmfsclients:children] @@ -33,7 +32,7 @@ controllers [jetstreamnfsclients:children] galaxynodes -[surmexechosts] +[slurmexechosts] [slurmexechosts:children] galaxynodes @@ -42,11 +41,11 @@ galaxynodes jetstream-iu-large [jetstream-iu-large] -jetstream-iu-large0 ansible_host=10.0.0.20 -jetstream-iu-large1 ansible_host=10.0.0.21 -jetstream-iu-large2 ansible_host=10.0.0.22 -jetstream-iu-large3 ansible_host=10.0.0.23 -jetstream-iu-large4 ansible_host=10.0.0.66 -jetstream-iu-large5 ansible_host=10.0.0.67 -jetstream-iu-large6 ansible_host=10.0.0.68 -jetstream-iu-large7 ansible_host=10.0.0.69 +#jetstream-iu-large0 ansible_host=10.0.0.20 +#jetstream-iu-large1 ansible_host=10.0.0.21 +#jetstream-iu-large2 ansible_host=10.0.0.22 +#jetstream-iu-large3 ansible_host=10.0.0.23 +#jetstream-iu-large4 ansible_host=10.0.0.66 +#jetstream-iu-large5 ansible_host=10.0.0.67 +#jetstream-iu-large6 ansible_host=10.0.0.68 +#jetstream-iu-large7 ansible_host=10.0.0.69 diff --git a/jetstreamiuenv/launch.yml b/jetstreamiuenv/launch.yml new file mode 120000 index 00000000..80cee732 --- /dev/null +++ b/jetstreamiuenv/launch.yml @@ -0,0 +1 @@ +../jetstream_common/launch.yml \ No newline at end of file diff --git a/jetstreamiuenv/terminate.yml b/jetstreamiuenv/terminate.yml new file mode 120000 index 00000000..478a873e --- /dev/null 
+++ b/jetstreamiuenv/terminate.yml @@ -0,0 +1 @@ +../jetstream_common/terminate.yml \ No newline at end of file diff --git a/roles/slurm/tasks/elastic.yml b/roles/slurm/tasks/elastic.yml index 1ba85f2d..c922d9d9 100644 --- a/roles/slurm/tasks/elastic.yml +++ b/roles/slurm/tasks/elastic.yml @@ -9,6 +9,72 @@ pip: name: "{{ item }}" virtualenv: "/opt/slurm_cloud_provision" + state: latest with_items: + - pip # If not latest, shade does not install - shade - ansible + +- name: Copy elasticity scripts + copy: + src: "{{ item }}" + dest: /opt/slurm_cloud_provision/bin/ + mode: "0755" + owner: slurm + group: slurm + with_items: + - slurm/launch + - slurm/terminate + notify: + - restart slurmctld + +- name: Copy this playbook + synchronize: # sync is much faster than copy but requires chown task below + src: "{{ playbook_dir }}/../" + dest: /opt/slurm_cloud_provision/infrastructure-playbook/ + +- name: Ensure slurm's .config/openstack dir exists + file: + path: /var/lib/slurm/.config/openstack + state: directory + owner: slurm + group: slurm + +- name: Place decrypted clouds.yaml + copy: + src: secret_group_vars/clouds.yaml + dest: /var/lib/slurm/.config/openstack/clouds.yaml + owner: slurm + group: slurm + mode: "0600" + +- name: Set slurm user as the owner of the playbook + file: + path: /opt/slurm_cloud_provision/infrastructure-playbook/ + owner: slurm + group: slurm + recurse: yes + +- name: Place vault pass + copy: + src: "../jetstream_common/files/elasticity/vp" + dest: /var/lib/slurm/.vault-pass + owner: slurm + group: slurm + mode: "0600" + +- name: Ensure slurm's .ssh dir exists + file: + path: /var/lib/slurm/.ssh + state: directory + owner: slurm + group: slurm + mode: "0700" # slurm-owned and private, matching the id_rsa placed below + +- name: Place PK + copy: + src: "../jetstream_common/files/elasticity/elasticity_kp" + dest: /var/lib/slurm/.ssh/id_rsa + owner: slurm + group: slurm + mode: "0600" diff --git a/roles/slurm/tasks/main.yml b/roles/slurm/tasks/main.yml index 58925a67..d481c100 100644 --- a/roles/slurm/tasks/main.yml +++ b/roles/slurm/tasks/main.yml @@ 
-45,7 +45,7 @@ - name: Install slurm.conf template: - src: "{{ slurm_conf_src | default( 'templates/slurm/slurm.conf.j2' ) }}" + src: "{{ slurm_conf_src | default( 'templates/slurm/slurm.conf.elastic.j2' ) }}" dest: "{{ slurm_conf_dir }}/slurm.conf" owner: root group: root @@ -93,7 +93,7 @@ group: slurm mode: 0755 state: directory - when: "'slurmservers' in group_names" + when: "'slurmservers' in group_names or 'slurmexechosts' in group_names" - name: Check munge dir file: