diff --git a/cli/.isort.cfg b/.isort.cfg similarity index 69% rename from cli/.isort.cfg rename to .isort.cfg index ba8fba4664..4817d8b544 100644 --- a/cli/.isort.cfg +++ b/.isort.cfg @@ -1,7 +1,7 @@ [settings] line_length=120 known_future_library=future -known_third_party=boto3,botocore,awscli,tabulate,argparse,configparser,pytest +known_third_party=boto3,botocore,awscli,tabulate,argparse,configparser,pytest,pytest-datadir,pytest-html,pytest-rerunfailures,pytest-xdist,retrying,junitparser,Jinja2 # 3 - Vertical Hanging Indent # from third_party import ( # lib1, diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9c3fc5df58..75146b58dc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,15 +2,65 @@ CHANGELOG ========= +2.2.1 +===== + +**ENHANCEMENTS** + +* Add support for FSx Lustre in CentOS 7. When using a custom AMI, FSx Lustre is + only supported with CentOS 7.5 and CentOS 7.6. +* Check AWS EC2 instance account limits before starting cluster creation +* Allow users to force job deletion with ``SGE`` scheduler + +**CHANGES** + +* Set default value to ``compute`` for ``placement_group`` option +* ``pcluster ssh``: use private IP when the public one is not available +* ``pcluster ssh``: now works also when stack is not completed as long as the master IP is available +* Remove unused dependency on ``awscli`` from ParallelCluster package + +**BUG FIXES** + +* ``awsbsub``: fix file upload with absolute path +* ``pcluster ssh``: fix issue that was preventing the command from working correctly when stack status is + ``UPDATE_ROLLBACK_COMPLETE`` +* Fix block device conversion to correctly attach EBS NVMe volumes +* Wait for Torque scheduler initialization before completing master node setup +* ``pcluster version``: now works also when no ParallelCluster config is present +* Improve ``nodewatcher`` daemon logic to detect if an SGE compute node has running jobs + +**DOCS** + +* Add documentation on how to use FSx Lustre +* Add tutorial for encrypted EBS with a Custom KMS Key +* Add ``ebs_kms_key_id`` to Configuration section + +**TESTING** + +* Define a new framework to write and run ParallelCluster integration tests +* Improve scaling integration tests to detect over-scaling +* Implement integration tests for awsbatch scheduler +* Implement integration tests for FSx Lustre file system + 2.1.1 ===== -* China regions `cn-north-1` and `cn-northwest-1` +* Add China regions `cn-north-1` and `cn-northwest-1` 2.1.0 ===== -* RAID support -* EFS support -* AWS Batch Multinode Parallel Support +* Add configuration for RAID 0 and 1 volumes +* Add Elastic File System (EFS) support +* Add AWS Batch Multinode Parallel jobs support +* Add support for Stockholm region (`eu-north-1`) +* Add `--env` and `--env-blacklist` options to the `awsbsub` command to export environment variables + in the job environment +* Add `--input-file` option to the `awsbsub` command to stage-in input files from the client +* Add new `PCLUSTER_JOB_S3_URL` variable to the job execution environment pointing to the S3 URL used + for job data stage-in/out +* Add S3 URL for job data staging to the `awsbstat -d` output +* Add `--working-dir` and `--parent-working-dir` options to the `awsbsub` command to specify + the working directory or the parent working directory for the job +* Add CPUs and Memory information to the `awsbhosts -d` command 2.0.2 ===== @@ -56,8 +106,11 @@ CHANGELOG 1.5.2 ===== * feature:``cfncluster``: Added ClusterUser as a stack output. This makes it easier to get the username of the head node.
-* feature:``cfncluster``: Added `cfncluster ssh cluster_name`, this allows you to easily ssh into your clusters. It allows arbitrary command execution and extra ssh flags to be provided after the command. See https://aws-parallelcluster.readthedocs.io/en/latest/commands.html#ssh -* change:``cfncluster``: Moved global cli flags to the command specific flags. For example `cfncluster --region us-east-1 create` now becomes `cfncluster create --region us-east-1` +* feature:``cfncluster``: Added `cfncluster ssh cluster_name`, this allows you to easily ssh into your clusters. + It allows arbitrary command execution and extra ssh flags to be provided after the command. + See https://aws-parallelcluster.readthedocs.io/en/latest/commands.html#ssh +* change:``cfncluster``: Moved global cli flags to the command specific flags. + For example `cfncluster --region us-east-1 create` now becomes `cfncluster create --region us-east-1` * bugfix:``cfncluster-cookbook``: Fix bug that prevented c5d/m5d instances from working * bugfix:``cfncluster-cookbook``: Set CPU as a consumable resource in slurm * bugfix:``cfncluster-node``: Fixed Slurm behavior to add CPU slots so multiple jobs can be scheduled on a single node @@ -65,21 +118,19 @@ CHANGELOG 1.5.1 ===== * change:``cfncluster``: Added "ec2:DescribeVolumes" permissions to -CfnClusterInstancePolicy + CfnClusterInstancePolicy * change:``cfncluster``: Removed YAML CloudFormation template, it can be -generated by the https://github.com/awslabs/aws-cfn-template-flip tool + generated by the https://github.com/awslabs/aws-cfn-template-flip tool * updates:``cfncluster``: Add support for eu-west-3 region * feature:``cfncluster-cookbook``: Added parameter to specify custom -cfncluster-node package + cfncluster-node package * bugfix:``cfncluster``: Fix --template-url command line parameter * bugfix:``cfncluster-cookbook``: Poll on EBS Volume attachment status -* bugfix:``cfncluster-cookbook``: Fixed SLURM cron job to publish pending -metric -* bugfix:``cfncluster-node``: Fixed Torque behaviour when scaling up from an -empty cluster +* bugfix:``cfncluster-cookbook``: Fixed SLURM cron job to publish pending metric +* bugfix:``cfncluster-node``: Fixed Torque behaviour when scaling up from an empty cluster 1.4.2 @@ -87,10 +138,9 @@ empty cluster * bugfix:``cfncluster``: Fix crash when base directory for config file does not exist * bugfix:``cfncluster``: Removed extraneous logging message at - cfncluster invocation, re-enabled logging in - ~/.cfncluster/cfncluster-cli.log + cfncluster invocation, re-enabled logging in ~/.cfncluster/cfncluster-cli.log * bugfix: ``cfncluster-node``: Fix scaling issues with CentOS 6 clusters caused -by incompatible dependencies. + by incompatible dependencies. * updates:``ami``: Update all base AMIs to latest patch levels * updates:``cfncluster-cookbook``: Updated to cfncluster-cookbook-1.4.1 @@ -161,7 +211,7 @@ by incompatible dependencies. 1.0.0 ===== -Offiical release of the CfnCluster 1.x CLI, templates and AMIs. Available in all regions except BJS, with +Official release of the CfnCluster 1.x CLI, templates and AMIs. Available in all regions except BJS, with support for Amazon Linux, CentOS 6 & 7 and Ubuntu 14.04 LTS. All AMIs are built via packer from the CfnCluster Cookbook project (https://github.com/aws/aws-parallelcluster-cookbook). 
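The "Check AWS EC2 instance account limits before starting cluster creation" enhancement listed under 2.2.1 is implemented in cli/pcluster/cfnconfig.py later in this diff (the __check_account_capacity method) by issuing an EC2 DryRun run_instances request sized to the cluster's maximum size and inspecting the resulting error code. The following is a minimal standalone sketch of that technique, not part of the change itself; the function name and the way failures are reported are illustrative only.

import boto3
from botocore.exceptions import ClientError


def check_account_capacity(region, instance_type, max_size, ami_id, subnet_id=None):
    """Dry-run a launch of max_size instances to surface account and capacity limits before creating the stack."""
    ec2 = boto3.client("ec2", region_name=region)
    request = {
        "InstanceType": instance_type,
        "MinCount": max_size,
        "MaxCount": max_size,
        "ImageId": ami_id,
        "DryRun": True,
    }
    if subnet_id:
        request["SubnetId"] = subnet_id
    try:
        ec2.run_instances(**request)
    except ClientError as e:
        code = e.response["Error"]["Code"]
        if code == "DryRunOperation":
            # The request would have succeeded: the account can launch max_size instances.
            return
        # e.g. InstanceLimitExceeded, InsufficientInstanceCapacity, InsufficientFreeAddressesInSubnet
        raise SystemExit(
            "Unable to launch {0} x {1} in {2}: {3}".format(
                max_size, instance_type, region, e.response["Error"]["Message"]
            )
        )

For the awsbatch scheduler the real check first converts the configured maximum vCPU count into an instance count with ceiling division, -(-max_size // vcpus), using the vCPU count looked up in the instance pricing file, before running the dry run.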
diff --git a/amis.txt b/amis.txt index fd5e824cfd..9ac60a52bd 100644 --- a/amis.txt +++ b/amis.txt @@ -1,100 +1,100 @@ # alinux -ap-northeast-1: ami-0ac1d0c35dc3b2a97 -ap-northeast-2: ami-0dddb89f66485f828 -ap-northeast-3: ami-0dbb09bfada65298f -ap-south-1: ami-0ed1bb8f2ab0edffb -ap-southeast-1: ami-058bcd8377aba9bef -ap-southeast-2: ami-012557ce9426ef1a0 -ca-central-1: ami-059dda8ee9af7c20c -cn-north-1: ami-00237da8e056b0936 -cn-northwest-1: ami-0974fc483e449f5ee -eu-central-1: ami-09cff6787920e967c -eu-north-1: ami-086f4f382fb1119f0 -eu-west-1: ami-080a7d5c75253bd1b -eu-west-2: ami-0b284f1028a743865 -eu-west-3: ami-099197e40d0c5de6e -sa-east-1: ami-02e542e4935ff9647 -us-east-1: ami-0cd2dd3198972a68c -us-east-2: ami-057e70f0fbb007ab6 -us-gov-east-1: ami-022f96b137a63c9ff -us-gov-west-1: ami-fefb989f -us-west-1: ami-09686d4090e35a702 -us-west-2: ami-0c588cdc9e91b0db3 +ap-northeast-1: ami-01e7e307e734daa9b +ap-northeast-2: ami-096da9252851971b7 +ap-northeast-3: ami-0fde1cce0915721ec +ap-south-1: ami-0856113449f34cfde +ap-southeast-1: ami-019762c344b80574a +ap-southeast-2: ami-09ddff457fb3815ea +ca-central-1: ami-0ef3d34adf231688a +cn-north-1: ami-053a5a11a4ac83842 +cn-northwest-1: ami-0c5a99564e44467e7 +eu-central-1: ami-01d7252afc45b0d8b +eu-north-1: ami-02224ba7786413561 +eu-west-1: ami-0a32ae196621ce1cd +eu-west-2: ami-0e8d810df84d61f8b +eu-west-3: ami-09a1847f7683c6351 +sa-east-1: ami-03c4f6e67dcea925d +us-east-1: ami-096b5898281e68ea3 +us-east-2: ami-0c8b41c511db3c17c +us-gov-east-1: ami-08cff3f9ef830bdfb +us-gov-west-1: ami-afa0c8ce +us-west-1: ami-08e9806b160f9aa44 +us-west-2: ami-04eba5b9de0d94ec6 # centos6 -ap-northeast-1: ami-003cfe6266cadd576 -ap-northeast-2: ami-032b3f2a4f1ac91a0 -ap-northeast-3: ami-0c4de7aece7b2db33 -ap-south-1: ami-0581583c3d7507d9e -ap-southeast-1: ami-0d00309b80b772532 -ap-southeast-2: ami-00ecf7e455945e8bc -ca-central-1: ami-0416d41ed6dbc0bd7 -eu-central-1: ami-0b1f52047bb2b7f83 -eu-north-1: ami-04fc976e8108996e6 -eu-west-1: ami-073f1f5db6cfdd3d1 -eu-west-2: ami-0ace56d1d9c1aa466 -eu-west-3: ami-0d377bccde07c887d -sa-east-1: ami-0b1c8f6aad337d5b6 -us-east-1: ami-0919d912e0e33d247 -us-east-2: ami-0bbd43b2b8991cdae -us-west-1: ami-0ac1cf1e68288fa36 -us-west-2: ami-08b28682da5721f5b +ap-northeast-1: ami-09c0d7b7eba653962 +ap-northeast-2: ami-04bb0577f1425f61f +ap-northeast-3: ami-092c70a47ecb4f730 +ap-south-1: ami-0b6b4a5c5952ba214 +ap-southeast-1: ami-0d8fee73833faed87 +ap-southeast-2: ami-03e0e45d59cf130b3 +ca-central-1: ami-0f5cabbce86422cc0 +eu-central-1: ami-0c47523135cb69662 +eu-north-1: ami-064af23ad3c8bac88 +eu-west-1: ami-048edb0ed06f91ef8 +eu-west-2: ami-098b95acdef72fd6c +eu-west-3: ami-0e25f1a2ef5f8d237 +sa-east-1: ami-07bcda88df87d3db2 +us-east-1: ami-0fd0ae64eb509cc23 +us-east-2: ami-09c42f6a076482eeb +us-west-1: ami-060bcf74fb6278986 +us-west-2: ami-009c08111f57a2d4c # centos7 -ap-northeast-1: ami-0c0049e4eeb0ef1ac -ap-northeast-2: ami-00f8f46a043a04530 -ap-northeast-3: ami-0043e5e12872a00d4 -ap-south-1: ami-03dd063b05c3082f1 -ap-southeast-1: ami-00fcdc55bd29f691e -ap-southeast-2: ami-0baa1787fd7a71950 -ca-central-1: ami-0c2a8b54dfd0c405f -eu-central-1: ami-00b3f34240b6021dd -eu-north-1: ami-07ac74edc9e96b343 -eu-west-1: ami-0879d97613ba8075a -eu-west-2: ami-06aa0ca6f28c29fdb -eu-west-3: ami-0b28dcaf2b4d00d87 -sa-east-1: ami-00a570d9536621660 -us-east-1: ami-09d092c1b054832df -us-east-2: ami-0e0cda1cdec7fcecf -us-west-1: ami-084c065e503e74449 -us-west-2: ami-070345cb145f2d2f0 +ap-northeast-1: ami-0307ad022fdd9bd50 +ap-northeast-2: 
ami-01610b2d147974908 +ap-northeast-3: ami-04dd95194c8990d40 +ap-south-1: ami-0d848603d9bf30b76 +ap-southeast-1: ami-084b71a47f7c857b7 +ap-southeast-2: ami-0b06de8c266942eff +ca-central-1: ami-0a2b442fa56fe5db0 +eu-central-1: ami-089c27327ed56f33c +eu-north-1: ami-016fe6e29bff94e38 +eu-west-1: ami-00e9df6764697605f +eu-west-2: ami-00884b56887c9585a +eu-west-3: ami-0252465d6dff43cfb +sa-east-1: ami-02f4e6da1e0de2347 +us-east-1: ami-08b43bfe7a0d16eeb +us-east-2: ami-0c71e12e95cc937ef +us-west-1: ami-0f925f166f349eaa4 +us-west-2: ami-05f6a646767bf0c87 # ubuntu1404 -ap-northeast-1: ami-053a84278e34b7a59 -ap-northeast-2: ami-03d314c5ac10beb53 -ap-northeast-3: ami-070f9069b20a797ec -ap-south-1: ami-0641d7ca885530589 -ap-southeast-1: ami-0e3de99412375e882 -ap-southeast-2: ami-09eae4580e8fc835a -ca-central-1: ami-08aeb7a57f73b58ab -cn-north-1: ami-00f2cae5406fb3fce -eu-central-1: ami-0b24a435216670b4a -eu-north-1: ami-0921b515f8ed512c3 -eu-west-1: ami-076fbdec21cd5c940 -eu-west-2: ami-0fb01b5b56bc27509 -eu-west-3: ami-0ea8b79c622d0a4a2 -sa-east-1: ami-00cf3910c959e9fd5 -us-east-1: ami-095016c5ff0ab7ae6 -us-east-2: ami-02a21f90af8a453f0 -us-gov-east-1: ami-09274e2587c4154f2 -us-gov-west-1: ami-6cf0930d -us-west-1: ami-099ebeb456dbf0646 -us-west-2: ami-05ddc7ec0edb9069f +ap-northeast-1: ami-0141f9239dd88f7eb +ap-northeast-2: ami-069d8e1e9b74ee48d +ap-northeast-3: ami-07853be3b108fb14c +ap-south-1: ami-02744b942f4094210 +ap-southeast-1: ami-070fef5acc9fd3de2 +ap-southeast-2: ami-097c066ff342e12fa +ca-central-1: ami-075be3409c2c2738d +cn-north-1: ami-0e90250aa31ac4a12 +eu-central-1: ami-093d2529752e5e2f2 +eu-north-1: ami-01ea61e8b26299de1 +eu-west-1: ami-0d1cb938a6d6b693c +eu-west-2: ami-0487ce94890b5869b +eu-west-3: ami-01a321e226dbf081a +sa-east-1: ami-02366044e8d7372fa +us-east-1: ami-00f8fb8ed06007c1f +us-east-2: ami-0e23f662335e169a6 +us-gov-east-1: ami-0a34c5de0403642d2 +us-gov-west-1: ami-e4a3cb85 +us-west-1: ami-021201edbaab31f29 +us-west-2: ami-085bb9cfafd2e3b3a # ubuntu1604 -ap-northeast-1: ami-026669cfcef23b3de -ap-northeast-2: ami-048214d1413ed8462 -ap-northeast-3: ami-038900edb73cb9496 -ap-south-1: ami-0cc424ec58256ea88 -ap-southeast-1: ami-059ba95190db36590 -ap-southeast-2: ami-04df2433ab61d3f37 -ca-central-1: ami-039a1b0ada060b5ce -cn-north-1: ami-072046713a0458796 -eu-central-1: ami-0d816068d1164f4d2 -eu-north-1: ami-046c32486a9abf742 -eu-west-1: ami-0f641e63ebaf647b1 -eu-west-2: ami-067c1c0157477c166 -eu-west-3: ami-0102caf8c6ec0768a -sa-east-1: ami-0ef5c70aec338bcfb -us-east-1: ami-098c8e582ca818cff -us-east-2: ami-055279b0b09d12a71 -us-gov-east-1: ami-0b664ce8c427b77eb -us-gov-west-1: ami-5ef6953f -us-west-1: ami-0db5e85c0b1ce2c20 -us-west-2: ami-02393fa61ac61547a +ap-northeast-1: ami-00bacec2848062b6b +ap-northeast-2: ami-0a6edfea96ed6c9db +ap-northeast-3: ami-09eeb02948489e793 +ap-south-1: ami-0dcc47340ec3a0e45 +ap-southeast-1: ami-0771cf73e73259040 +ap-southeast-2: ami-0252c6d83fa35183c +ca-central-1: ami-090b3ee62a0b5412b +cn-north-1: ami-0b89a3058ea57b76f +eu-central-1: ami-0c7ccc7ec89bd0d75 +eu-north-1: ami-0675d81a659cb530e +eu-west-1: ami-010af7d7788dac778 +eu-west-2: ami-01290e9a5298ae134 +eu-west-3: ami-0996ed0efb2b8cc4a +sa-east-1: ami-0a3ef670136cc81ee +us-east-1: ami-05f79ab77fc13e20a +us-east-2: ami-08a675767416c627a +us-gov-east-1: ami-08b15aaa28124fac1 +us-gov-west-1: ami-eba3cb8a +us-west-1: ami-059b8cd1b0041071a +us-west-2: ami-034bffb8da06d6951 diff --git a/cli/.flake8 b/cli/.flake8 index f2c4c84977..26265ce3ca 100644 --- a/cli/.flake8 +++ b/cli/.flake8 @@ -9,13 
+9,18 @@ ignore = # E402 module level import not at top of file # D101 Missing docstring in public class # D102 Missing docstring in public method +# D205 1 blank line required between summary line and description +# D400 First line should end with a period +# D401 First line should be in imperative mood per-file-ignores = pcluster/*.py: D103 - pcluster/config_sanity.py: E402 - pcluster/easyconfig.py: E402 - pcluster/cfnconfig.py: E402 - tests/pcluster/pcluster-unittest.py: D101, D102 + pcluster/config_sanity.py: E402, D103 + pcluster/easyconfig.py: E402, D103 + pcluster/cfnconfig.py: E402, D103 + tests/pcluster/pcluster-unittest.py: D101, D102, D103 tests/awsbatch/test_*.py: D101, D102 + ../tests/integration-tests/tests/*: D103 + ../tests/integration-tests/*: D205, D400, D401 exclude = .tox, .git, diff --git a/cli/awsbatch/awsbsub.py b/cli/awsbatch/awsbsub.py index cd2490ad2c..07a4e35ced 100644 --- a/cli/awsbatch/awsbsub.py +++ b/cli/awsbatch/awsbsub.py @@ -214,7 +214,7 @@ def _upload_and_get_command(boto3_factory, args, job_s3_folder, job_name, config # upload input files, if there if args.input_file: for file in args.input_file: - s3_uploader.put_file(file, file) + s3_uploader.put_file(file, os.path.basename(file)) # upload command, if needed if args.command_file or not sys.stdin.isatty() or args.env: diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index fd24e27bba..e4cd92a796 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -29,45 +29,665 @@ import pkg_resources from botocore.exceptions import ClientError -from . import config_sanity - - -def get_stack_template(region, aws_access_key_id, aws_secret_access_key, stack): - cfn = boto3.client( - "cloudformation", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - __stack_name = "parallelcluster-" + stack - - try: - __stack = cfn.describe_stacks(StackName=__stack_name).get("Stacks")[0] - except ClientError as e: - print(e.response.get("Error").get("Message")) - sys.stdout.flush() +from pcluster.config_sanity import ResourceValidator +from pcluster.utils import get_vcpus_from_pricing_file + + +class ParallelClusterConfig(object): + """Manage ParallelCluster Config.""" + + MAX_EBS_VOLUMES = 5 + + def __init__(self, args): + self.args = args + self.parameters = {} + self.version = pkg_resources.get_distribution("aws-parallelcluster").version + + # Initialize configuration attribute by parsing config file + self.__config = self.__init_config() + + # Initialize region and credentials public attributes + self.__init_region() + self.__init_credentials() + + # Get cluster template and define corresponding parameter + cluster_template = self.__get_cluster_template() + self.__cluster_section = "cluster %s" % cluster_template + self.parameters["CLITemplate"] = cluster_template + + # Check for update, if required, according to the configuration parameter + self.__check_for_updates() + + # Initialize sanity_check private attribute and ResourceValidator object + self.__init_sanity_check() + + # Initialize key name public attribute and corresponding parameter + self.__init_key_name() + + # Initialize template url public attribute + self.__init_template_url() + + # Validate VPC configuration settings and initialize corresponding parameters + self.__init_vpc_parameters() + + # Initialize Scheduler parameters + self.__init_scheduler_parameters() + + # Initialize parameters related to the cluster configuration + self.__init_cluster_parameters() + + # 
Verify Account limits + if self.__sanity_check: + self.__check_account_capacity() + + # Initialize ExtraJson parameter + self.__init_extra_json_parameter() + + # Initialize Tags public attribute + self.__init_tags() + + # Initialize EBS related parameters + self.__init_ebs_parameters() + + # Initialize EFS related parameters + self.__init_efs_parameters() + + # Initialize RAID related parameters + self.__init_raid_parameters() + + # Initialize FSx related parameters + self.__init_fsx_parameters() + + # Initialize scaling related parameters + self.__init_scaling_parameters() + + # Initialize aliases public attributes + self.__init_aliases() + + # Handle extra parameters supplied on command-line + try: + if self.args.extra_parameters is not None: + self.parameters.update(dict(self.args.extra_parameters)) + except AttributeError: + pass + + @staticmethod + def __fail(message): + """Print an error message and exit.""" + print("ERROR: {0}".format(message)) sys.exit(1) - __cli_template = [ - p.get("ParameterValue") for p in __stack.get("Parameters") if p.get("ParameterKey") == "CLITemplate" - ][0] - return __cli_template + def __init_config(self): + """ + Initialize configuration from file. + + :return: configuration object + """ + # Determine config file name based on args or default + if hasattr(self.args, "config_file") and self.args.config_file is not None: + config_file = self.args.config_file + default_config = False + else: + config_file = os.path.expanduser(os.path.join("~", ".parallelcluster", "config")) + default_config = True + + if not os.path.isfile(config_file): + if default_config: + self.__fail( + "Default config {0} not found.\nYou can copy a template from here: {1}{2}examples{2}config".format( + config_file, + os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))), + os.path.sep, + ) + ) + else: + self.__fail("Config file %s not found" % config_file) + + config = configparser.ConfigParser() + config.read(config_file) + return config + + def _get_config_value(self, section, key, default=None): + """ + Get configuration key value from the given section. + + :param section: Configuration file section + :param key: Configuration parameter key + :param default: default value to return if the option is not present in the configuration file + :return: Configuration parameter value, or if not found. + """ + try: + return self.__config.get(section, key) + except configparser.NoOptionError: + return default + + def __init_region(self): + """ + Initialize region attribute. 
+ + Order is 1) CLI arg 2) AWS_DEFAULT_REGION env 3) Config file 4) us-east-1 + """ + if hasattr(self.args, "region") and self.args.region: + self.region = self.args.region + elif os.environ.get("AWS_DEFAULT_REGION"): + self.region = os.environ.get("AWS_DEFAULT_REGION") + else: + self.region = self._get_config_value("aws", "aws_region_name", "us-east-1") + + def __init_credentials(self): + """Init credentials by checking if they have been provided in config.""" + self.aws_access_key_id = self._get_config_value("aws", "aws_access_key_id") + self.aws_secret_access_key = self._get_config_value("aws", "aws_secret_access_key") + + def __get_stack_name(self): + """Return stack name.""" + return "parallelcluster-" + self.args.cluster_name + + def __get_stack_template(self): + """Get stack template.""" + cfn = boto3.client( + "cloudformation", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + + try: + stack = cfn.describe_stacks(StackName=self.__get_stack_name()).get("Stacks")[0] + except ClientError as e: + self.__fail(e.response.get("Error").get("Message")) + + cli_template = [ + p.get("ParameterValue") for p in stack.get("Parameters") if p.get("ParameterKey") == "CLITemplate" + ][0] + + return cli_template + + def __get_cluster_template(self): + """ + Determine which cluster template will be used and return it. + + :return: the cluster template to use + """ + args_func = self.args.func.__name__ + if args_func in ["start", "stop", "instances"]: + # Starting and stopping a cluster is unique in that we would want to prevent the + # customer from inadvertently using a different template than what + # the cluster was created with, so we do not support the -t + # parameter. We always get the template to use from CloudFormation. + cluster_template = self.__get_stack_template() + else: + try: + try: + if self.args.cluster_template is not None: + cluster_template = self.args.cluster_template + elif args_func == "update": + cluster_template = self.__get_stack_template() + else: + cluster_template = self.__config.get("global", "cluster_template") + except AttributeError: + cluster_template = self.__config.get("global", "cluster_template") + except configparser.NoOptionError: + self.__fail("Missing 'cluster_template' option in [global] section.") + + return cluster_template + + def __check_for_updates(self): + """Check for updates, if required.""" + # verify if package updates should be checked + try: + update_check = self.__config.getboolean("global", "update_check") + except configparser.NoOptionError: + update_check = True + + if update_check is True: + try: + latest = json.loads( + urllib.request.urlopen("https://pypi.python.org/pypi/aws-parallelcluster/json").read() + )["info"]["version"] + if self.version < latest: + print("warning: There is a newer version %s of AWS ParallelCluster available." % latest) + except Exception: + pass + + def __init_sanity_check(self): + """ + Check if config sanity should be run and initialize the corresponding attribute. + + The method also initializes the ResourceValidator object, to be used to validate the resources. 
+ """ + try: + self.__sanity_check = self.__config.getboolean("global", "sanity_check") + self.__resource_validator = ResourceValidator( + self.region, self.aws_access_key_id, self.aws_secret_access_key + ) + + # Only check config on calls that mutate it + if self.args.func.__name__ not in ["create", "update", "configure"]: + self.__sanity_check = False + + except configparser.NoOptionError: + self.__sanity_check = False + + def __validate_resource(self, resource_type, resource_value): + """ + Validate the given resource, only if the sanity_check configuration parameter is set to true. + + :param resource_type: Resource type + :param resource_value: Resource value + :return True or False (for the EFSFSId resource type only), nothing in the other cases + """ + if self.__sanity_check: + self.__resource_validator.validate(resource_type, resource_value) + + def __init_key_name(self): + """Get the EC2 keypair name to be used and set the corresponding attribute and parameter, exit if not set.""" + try: + self.key_name = self.__config.get(self.__cluster_section, "key_name") + if not self.key_name: + self.__fail("key_name set in [%s] section but not defined." % self.__cluster_section) + self.__validate_resource("EC2KeyPair", self.key_name) + except configparser.NoOptionError: + self.__fail("Missing key_name option in [%s] section." % self.__cluster_section) + + self.parameters["KeyName"] = self.key_name + + def __init_template_url(self): + """ + Determine the CloudFormation URL to be used and initialize the corresponding attribute. + + Order is 1) CLI arg 2) Config file 3) default for version + region + """ + try: + if self.args.template_url is not None: + self.template_url = self.args.template_url + else: + try: + self.template_url = self.__config.get(self.__cluster_section, "template_url") + if not self.template_url: + self.__fail("template_url set in [%s] section but not defined." 
% self.__cluster_section) + self.__validate_resource("URL", self.template_url) + except configparser.NoOptionError: + s3_suffix = ".cn" if self.region.startswith("cn") else "" + self.template_url = ( + "https://s3.%s.amazonaws.com%s/%s-aws-parallelcluster/templates/" + "aws-parallelcluster-%s.cfn.json" % (self.region, s3_suffix, self.region, self.version) + ) + except AttributeError: + pass + + def __init_vpc_parameters(self): + """Initialize VPC Parameters.""" + # Determine which vpc settings section will be used + vpc_settings = self.__config.get(self.__cluster_section, "vpc_settings") + vpc_section = "vpc %s" % vpc_settings + + # Dictionary list of all VPC options + vpc_options = dict( + vpc_id=("VPCId", "VPC"), + master_subnet_id=("MasterSubnetId", "VPCSubnet"), + compute_subnet_cidr=("ComputeSubnetCidr", None), + compute_subnet_id=("ComputeSubnetId", "VPCSubnet"), + use_public_ips=("UsePublicIps", None), + ssh_from=("AccessFrom", None), + access_from=("AccessFrom", None), + additional_sg=("AdditionalSG", "VPCSecurityGroup"), + vpc_security_group_id=("VPCSecurityGroupId", "VPCSecurityGroup"), + ) + self.__master_subnet = self.__config.get(vpc_section, "master_subnet_id") + + # Loop over all VPC options and add define to parameters, raise Exception is defined but null + for key in vpc_options: + try: + __temp__ = self.__config.get(vpc_section, key) + if not __temp__: + self.__fail("%s defined but not set in [%s] section" % (key, vpc_section)) + if vpc_options.get(key)[1] is not None: + self.__validate_resource(vpc_options.get(key)[1], __temp__) + self.parameters[vpc_options.get(key)[0]] = __temp__ + except configparser.NoOptionError: + pass + except configparser.NoSectionError: + self.__fail( + "VPC section [%s] used in [%s] section is not defined" % (vpc_section, self.__cluster_section) + ) + + def __check_account_capacity(self): + """Try to launch the requested number of instances to verify Account limits.""" + test_ami_id = self.__get_latest_alinux_ami_id() + + instance_type = self.parameters.get("ComputeInstanceType", "t2.micro") + if instance_type == "optimal": + return + + max_size = self.__get_max_number_of_instances(instance_type) + try: + # Check for insufficient Account capacity + ec2 = boto3.client("ec2", region_name=self.region) + + subnet_id = self.parameters.get("ComputeSubnetId") + if subnet_id: + ec2.run_instances( + InstanceType=instance_type, + MinCount=max_size, + MaxCount=max_size, + ImageId=test_ami_id, + SubnetId=subnet_id, + DryRun=True, + ) + else: + ec2.run_instances( + InstanceType=instance_type, MinCount=max_size, MaxCount=max_size, ImageId=test_ami_id, DryRun=True + ) + except ClientError as e: + code = e.response.get("Error").get("Code") + message = e.response.get("Error").get("Message") + if code == "DryRunOperation": + pass + elif code == "InstanceLimitExceeded": + self.__fail( + "The configured max size parameter {0} exceeds the AWS Account limit " + "in the {1} region.\n{2}".format(max_size, self.region, message) + ) + elif code == "InsufficientInstanceCapacity": + self.__fail( + "The configured max size parameter {0} exceeds the On-Demand capacity on AWS.\n{1}".format( + max_size, message + ) + ) + elif code == "InsufficientFreeAddressesInSubnet": + self.__fail( + "The configured max size parameter {0} exceeds the number of free private IP addresses " + "available in the Compute subnet.\n{1}".format(max_size, message) + ) + else: + self.__fail( + "Unable to check AWS Account limits. 
Please double check your cluster configuration.\n%s" % message + ) + + def __get_max_number_of_instances(self, instance_type): + """ + Get the maximum number of requestable instances according to the scheduler type and other configuration params. + + :param instance_type The instance type to use in the awsbatch case + :return: the max number of instances requestable by the user + """ + try: + max_size = int(self.parameters.get("MaxSize")) + if self.parameters.get("Scheduler") == "awsbatch": + vcpus = get_vcpus_from_pricing_file(self.region, instance_type) + max_size = -(-max_size // vcpus) + except ValueError: + self.__fail("Unable to convert max size parameter to an integer") + return max_size + + def __get_latest_alinux_ami_id(self): + try: + # get latest alinux ami id to use as test image + ssm = boto3.client("ssm", region_name=self.region) + test_ami_id = ( + ssm.get_parameters_by_path(Path="/aws/service/ami-amazon-linux-latest") + .get("Parameters")[0] + .get("Value") + ) + except ClientError as e: + self.__fail("Unable to check AWS Account limits.\n%s" % e.response.get("Error").get("Message")) + return test_ami_id + + def __init_scheduler_parameters(self): + """Validate scheduler related configuration settings and initialize corresponding parameters.""" + # use sge as default scheduler + if self.__config.has_option(self.__cluster_section, "scheduler"): + self.parameters["Scheduler"] = self.__config.get(self.__cluster_section, "scheduler") + else: + self.parameters["Scheduler"] = "sge" + + # check for the scheduler since AWS Batch requires different configuration parameters + if self.parameters["Scheduler"] == "awsbatch": + self.__init_batch_parameters() + else: + self.__init_size_parameters() + + def __init_size_parameters(self): + """Initialize size parameters.""" + # Set defaults outside the cloudformation template + self.parameters["MinSize"] = "0" + self.parameters["DesiredSize"] = "2" + self.parameters["MaxSize"] = "10" + + size_parameters = OrderedDict( + initial_queue_size=("InitialQueueSize", None), + maintain_initial_size=("MaintainInitialSize", None), + max_queue_size=("MaxQueueSize", None), + ) + for key in size_parameters: + try: + __temp__ = self.__config.get(self.__cluster_section, key) + if not __temp__: + self.__fail("%s defined but not set in [%s] section" % (key, self.__cluster_section)) + if key == "initial_queue_size": + self.parameters["DesiredSize"] = __temp__ + elif key == "maintain_initial_size": + self.parameters["MinSize"] = self.parameters.get("DesiredSize") if __temp__ == "true" else "0" + elif key == "max_queue_size": + self.parameters["MaxSize"] = __temp__ + except configparser.NoOptionError: + pass + + def __init_cluster_parameters(self): + """Loop over all the cluster options and define parameters, raise Exception if defined but None.""" + cluster_options = dict( + cluster_user=("ClusterUser", None), + compute_instance_type=("ComputeInstanceType", None), + master_instance_type=("MasterInstanceType", None), + scheduler=("Scheduler", None), + cluster_type=("ClusterType", None), + ephemeral_dir=("EphemeralDir", None), + spot_price=("SpotPrice", None), + custom_ami=("CustomAMI", "EC2Ami"), + pre_install=("PreInstallScript", "URL"), + post_install=("PostInstallScript", "URL"), + proxy_server=("ProxyServer", None), + placement=("Placement", None), + placement_group=("PlacementGroup", "EC2PlacementGroup"), + encrypted_ephemeral=("EncryptedEphemeral", None), + pre_install_args=("PreInstallArgs", None), + post_install_args=("PostInstallArgs", None), + 
s3_read_resource=("S3ReadResource", None), + s3_read_write_resource=("S3ReadWriteResource", None), + tenancy=("Tenancy", None), + master_root_volume_size=("MasterRootVolumeSize", None), + compute_root_volume_size=("ComputeRootVolumeSize", None), + base_os=("BaseOS", None), + ec2_iam_role=("EC2IAMRoleName", "EC2IAMRoleName"), + extra_json=("ExtraJson", None), + custom_chef_cookbook=("CustomChefCookbook", None), + custom_chef_runlist=("CustomChefRunList", None), + additional_cfn_template=("AdditionalCfnTemplate", None), + custom_awsbatch_template_url=("CustomAWSBatchTemplateURL", None), + ) + for key in cluster_options: + try: + __temp__ = self.__config.get(self.__cluster_section, key) + if not __temp__: + self.__fail("%s defined but not set in [%s] section" % (key, self.__cluster_section)) + if cluster_options.get(key)[1] is not None: + self.__validate_resource(cluster_options.get(key)[1], __temp__) + self.parameters[cluster_options.get(key)[0]] = __temp__ + except configparser.NoOptionError: + pass + + def __init_extra_json_parameter(self): + """Check for extra_json = { "cluster" : ... } configuration parameters and map to "cfncluster".""" + extra_json = self.parameters.get("ExtraJson") + if extra_json: + extra_json = json.loads(extra_json) + if "cluster" in extra_json: + # support parallelcluster syntax by replacing the key + extra_json["cfncluster"] = extra_json.pop("cluster") + self.parameters["ExtraJson"] = json.dumps(extra_json) + + def __init_tags(self): + """ + Merge tags from config with tags from command line args. + + Command line args take precedent and overwrite tags supplied in the config. + """ + self.tags = {} + try: + tags = self.__config.get(self.__cluster_section, "tags") + self.tags = json.loads(tags) + except configparser.NoOptionError: + pass + try: + if self.args.tags is not None: + for key in self.args.tags: + self.tags[key] = self.args.tags[key] + except AttributeError: + pass + + def __init_scaling_parameters(self): # noqa: C901 FIXME!!! + """Initialize scaling related parameters.""" + # Determine if scaling settings are defined and set section. 
+ try: + self.__scaling_settings = self.__config.get(self.__cluster_section, "scaling_settings") + if not self.__scaling_settings: + self.__fail("scaling_settings defined but not set in [%s] section" % self.__cluster_section) + scaling_section = "scaling %s" % self.__scaling_settings + except configparser.NoOptionError: + scaling_section = None + + if scaling_section: + # Dictionary list of all scaling options + scaling_options = dict(scaledown_idletime=("ScaleDownIdleTime", None)) + for key in scaling_options: + try: + __temp__ = self.__config.get(scaling_section, key) + if not __temp__: + self.__fail("%s defined but not set in [%s] section" % (key, scaling_section)) + if scaling_options.get(key)[1] is not None: + self.__validate_resource(scaling_options.get(key)[1], __temp__) + self.parameters[scaling_options.get(key)[0]] = __temp__ + except configparser.NoOptionError: + pass + + def __init_aliases(self): + """Initialize the aliases attribute according to the configuration.""" + self.aliases = {} + alias_section = "aliases" + if self.__config.has_section(alias_section): + for alias in self.__config.options(alias_section): + self.aliases[alias] = self.__config.get(alias_section, alias) + + def __check_option_absent_awsbatch(self, option): + if self.__config.has_option(self.__cluster_section, option): + self.__fail("option %s cannot be used with awsbatch" % option) + + def __validate_awsbatch_os(self, baseos): + supported_batch_oses = ["alinux"] + if baseos not in supported_batch_oses: + self.__fail("awsbatch scheduler supports the following OSes: %s" % supported_batch_oses) + + def __init_batch_parameters(self): # noqa: C901 FIXME!!! + """Initialize Batch specific parameters.""" + self.__check_option_absent_awsbatch("initial_queue_size") + self.__check_option_absent_awsbatch("maintain_initial_size") + self.__check_option_absent_awsbatch("max_queue_size") + self.__check_option_absent_awsbatch("spot_price") + + if self.__config.has_option(self.__cluster_section, "base_os"): + self.__validate_awsbatch_os(self.__config.get(self.__cluster_section, "base_os")) + + if self.__config.has_option(self.__cluster_section, "compute_instance_type"): + compute_instance_type = self.__config.get(self.__cluster_section, "compute_instance_type") + self.parameters["ComputeInstanceType"] = compute_instance_type + else: + # use 'optimal' as default for awsbatch + self.parameters["ComputeInstanceType"] = "optimal" + + if self.__config.has_option(self.__cluster_section, "spot_bid_percentage"): + spot_bid_percentage = self.__config.get(self.__cluster_section, "spot_bid_percentage") + # use spot price to indicate spot bid percentage in case of awsbatch + self.parameters["SpotPrice"] = spot_bid_percentage + + if self.__config.has_option(self.__cluster_section, "custom_awsbatch_template_url"): + awsbatch_custom_url = self.__config.get(self.__cluster_section, "custom_awsbatch_template_url") + if not awsbatch_custom_url: + self.__fail( + "custom_awsbatch_template_url set in [%s] section but not defined."
% self.__cluster_section + ) + self.parameters["CustomAWSBatchTemplateURL"] = awsbatch_custom_url + + # Set batch default size parameters + self.parameters["MinSize"] = "0" + self.parameters["DesiredSize"] = "4" + self.parameters["MaxSize"] = "20" + + # Override those parameters from config if they are available + batch_size_parameters = dict( + min_vcpus=("MinVCpus", None), desired_vcpus=("DesiredVCpus", None), max_vcpus=("MaxVCpus", None) + ) + for key in batch_size_parameters: + try: + __temp__ = self.__config.get(self.__cluster_section, key) + if not __temp__: + self.__fail("%s defined but not set in [%s] section" % (key, self.__cluster_section)) + if key == "min_vcpus": + self.parameters["MinSize"] = __temp__ + elif key == "desired_vcpus": + self.parameters["DesiredSize"] = __temp__ + elif key == "max_vcpus": + self.parameters["MaxSize"] = __temp__ + except configparser.NoOptionError: + pass + self.__validate_resource("AWSBatch_Parameters", self.parameters) -class ParallelClusterConfig(object): - """Manage ParallelCluster Config.""" + def __get_section_name(self, parameter_name, section): + """ + Validate a section referenced in the cluster section exists, and returns the name of section. - def __get_efs_parameters(self, __config): # noqa: C901 FIXME!!! - # Determine if EFS settings are defined and set section + :param parameter_name: name of the parameter that references the section, i.e. "fsx_settings" + :param section: name of the section, i.e. "fsx" + :return: Full name of the section, if it exists, else None + """ try: - self.__efs_settings = __config.get(self.__cluster_section, "efs_settings") - if not self.__efs_settings: - print("ERROR: efs_settings defined but not set in [%s] section" % self.__cluster_section) - sys.exit(1) - self.__efs_section = "efs %s" % self.__efs_settings + section_name = self.__config.get(self.__cluster_section, parameter_name) + if not section_name: + self.__fail("%s defined but not set in [%s] section" % (parameter_name, self.__cluster_section)) + subsection = "%s %s" % (section, section_name) + if self.__config.has_section(subsection): + return subsection + else: + self.__fail("%s = %s defined but no [%s] section found" % (parameter_name, section_name, subsection)) except configparser.NoOptionError: pass + return None + + def __get_option_in_section(self, section, key): + """ + Get an option in a section, if not present return None. + + :param section: name of section, i.e. "fsx fs" + :param key: name of option, i.e. "shared_dir" + :return: value if set, otherwise None + """ + try: + value = self.__config.get(section, key) + if not value: + self.__fail("%s defined but not set in [%s] section" % (key, section)) + return value + except configparser.NoOptionError: + return None + + def __init_efs_parameters(self): # noqa: C901 FIXME!!! + efs_section = self.__get_section_name("efs_settings", "efs") + # Dictionary list of all EFS options self.__efs_options = OrderedDict( [ @@ -84,49 +704,30 @@ def __get_efs_parameters(self, __config): # noqa: C901 FIXME!!! 
__throughput_mode = None __provisioned_throughput = None try: - if self.__efs_section: + if efs_section: __temp_efs_options = [] for key in self.__efs_options: try: - __temp__ = __config.get(self.__efs_section, key) + __temp__ = self.__config.get(efs_section, key) if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, self.__efs_section)) - sys.exit(1) + self.__fail("%s defined but not set in [%s] section" % (key, efs_section)) if key == "provisioned_throughput": __provisioned_throughput = __temp__ elif key == "throughput_mode": __throughput_mode = __temp__ # Separate sanity_check for fs_id, need to pass in fs_id and subnet_id - if self.__sanity_check and self.__efs_options.get(key)[1] == "EFSFSId": - __valid_mt = config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - "EFSFSId", - (__temp__, self.__master_subnet), - ) - elif self.__sanity_check and self.__efs_options.get(key)[1] is not None: - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - self.__efs_options.get(key)[1], - __temp__, - ) + if self.__efs_options.get(key)[1] == "EFSFSId": + self.__validate_resource("EFSFSId", (__temp__, self.__master_subnet)) + __valid_mt = True + elif self.__efs_options.get(key)[1] is not None: + self.__validate_resource(self.__efs_options.get(key)[1], __temp__) __temp_efs_options.append(__temp__) except configparser.NoOptionError: __temp_efs_options.append("NONE") - pass # Separate sanity_check for throughput settings, # need to pass in throughput_mode and provisioned_throughput - if self.__sanity_check and (__provisioned_throughput is not None or __throughput_mode is not None): - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - "EFSThroughput", - (__throughput_mode, __provisioned_throughput), - ) + if __provisioned_throughput is not None or __throughput_mode is not None: + self.__validate_resource("EFSThroughput", (__throughput_mode, __provisioned_throughput)) if __valid_mt: __temp_efs_options.append("Valid") else: @@ -135,16 +736,8 @@ def __get_efs_parameters(self, __config): # noqa: C901 FIXME!!! except AttributeError: pass - def __get_raid_parameters(self, __config): # noqa: C901 FIXME!!! - # Determine if RAID settings are defined and set section - try: - self.__raid_settings = __config.get(self.__cluster_section, "raid_settings") - if not self.__raid_settings: - print("ERROR: raid_settings defined by not set in [%s] section" % self.__cluster_section) - sys.exit(1) - self.__raid_section = "raid %s" % self.__raid_settings - except configparser.NoOptionError: - pass + def __init_raid_parameters(self): # noqa: C901 FIXME!!! + raid_settings = self.__get_section_name("raid_settings", "raid") # Dictionary list of all RAID options self.__raid_options = OrderedDict( @@ -161,7 +754,7 @@ def __get_raid_parameters(self, __config): # noqa: C901 FIXME!!! ) try: - if self.__raid_section: + if raid_settings: __temp_raid_options = [] __raid_shared_dir = None __raid_vol_size = None @@ -169,10 +762,9 @@
__raid_type = None for key in self.__raid_options: try: - __temp__ = __config.get(self.__raid_section, key) + __temp__ = self.__config.get(raid_settings, key) if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, self.__raid_section)) - sys.exit(1) + self.__fail("%s defined but not set in [%s] section" % (key, raid_settings)) if key == "volume_size": __raid_vol_size = __temp__ elif key == "volume_iops": @@ -181,14 +773,8 @@ def __get_raid_parameters(self, __config): # noqa: C901 FIXME!!! __raid_shared_dir = __temp__ elif key == "raid_type": __raid_type = __temp__ - if self.__sanity_check and self.__raid_options.get(key)[1] is not None: - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - self.__raid_options.get(key)[1], - __temp__, - ) + if self.__raid_options.get(key)[1] is not None: + self.__validate_resource(self.__raid_options.get(key)[1], __temp__) __temp_raid_options.append(__temp__) except configparser.NoOptionError: if key == "num_of_raid_volumes": @@ -198,47 +784,68 @@ def __get_raid_parameters(self, __config): # noqa: C901 FIXME!!! pass if __raid_iops is not None: if __raid_vol_size is not None: - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - "RAIDIOPS", - (__raid_iops, __raid_vol_size), - ) + self.__validate_resource("RAIDIOPS", (__raid_iops, __raid_vol_size)) # If volume_size is not specified, check IOPS against default volume size, 20GB else: - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - "RAIDIOPS", - (__raid_iops, 20), - ) + self.__validate_resource("RAIDIOPS", (__raid_iops, 20)) if __raid_type is None and __raid_shared_dir is not None: - print("ERROR: raid_type (0 or 1) is required in order to create RAID array.") - sys.exit(1) + self.__fail("raid_type (0 or 1) is required in order to create RAID array.") self.parameters["RAIDOptions"] = ",".join(__temp_raid_options) except AttributeError: pass - def __ebs_determine_shared_dir(self, __config): # noqa: C901 FIXME!!! + def __init_fsx_parameters(self): + # Determine if FSx settings are defined and set section + fsx_section = self.__get_section_name("fsx_settings", "fsx") + + # If they don't use fsx_settings, then return + if not fsx_section: + return + + # Dictionary list of all FSx options + fsx_options = OrderedDict( + [ + ("shared_dir", ("FSXShared_dir", None)), + ("fsx_fs_id", ("FSXFileSystemId", "fsx_fs_id")), + ("storage_capacity", ("FSXCapacity", "FSx_storage_capacity")), + ("fsx_kms_key_id", ("FSXKMSKeyId", None)), + ("imported_file_chunk_size", ("ImportedFileChunkSize", "FSx_imported_file_chunk_size")), + ("export_path", ("ExportPath", None)), + ("import_path", ("ImportPath", None)), + ("weekly_maintenance_start_time", ("WeeklyMaintenanceStartTime", None)), + ] + ) + + temp_fsx_options = [] + for key in fsx_options: + value = self.__get_option_in_section(fsx_section, key) + if not value: + temp_fsx_options.append("NONE") + else: + # Separate sanity_check for fs_id, need to pass in fs_id and subnet_id + if self.__sanity_check and fsx_options.get(key)[1] == "fsx_fs_id": + self.__validate_resource("fsx_fs_id", (value, self.__master_subnet)) + elif self.__sanity_check and fsx_options.get(key)[1] is not None: + self.__validate_resource(fsx_options.get(key)[1], value) + temp_fsx_options.append(value) + self.parameters["FSXOptions"] = ",".join(temp_fsx_options) + + def __ebs_determine_shared_dir(self): # noqa: C901 FIXME!!!
# Handle the shared_dir under EBS setting sections __temp_dir_list = [] try: if self.__ebs_section: for section in self.__ebs_section: try: - __temp_shared_dir = __config.get(section, "shared_dir") + __temp_shared_dir = self.__config.get(section, "shared_dir") if not __temp_shared_dir: - print("ERROR: shared_dir defined but not set in [%s] section" % section) - sys.exit(1) + self.__fail("shared_dir defined but not set in [%s] section" % section) __temp_dir_list.append(__temp_shared_dir) except configparser.NoOptionError: pass except configparser.NoSectionError: - print("ERROR: [%s] section defined in ebs_settings does not exist" % section) - sys.exit(1) + self.__fail("[%s] section defined in ebs_settings does not exist" % section) except AttributeError: pass @@ -251,45 +858,41 @@ def __ebs_determine_shared_dir(self, __config): # noqa: C901 FIXME!!! # For backwards compatibility with just 1 volume explicitly specified through ebs_settings elif len(self.__ebs_section) == 1: try: - __temp_shared_dir = __config.get(self.__cluster_section, "shared_dir") + __temp_shared_dir = self.__config.get(self.__cluster_section, "shared_dir") if not __temp_shared_dir: - print("ERROR: shared_dir defined but not set") - sys.exit(1) + self.__fail("shared_dir defined but not set") self.parameters["SharedDir"] = __temp_shared_dir except configparser.NoOptionError: pass else: - print( - "ERROR: not enough shared directories provided.\n" + self.__fail( + "not enough shared directories provided.\n" "When using multiple EBS Volumes, please specify a shared_dir under each [ebs] section" ) - sys.exit(1) except AttributeError: try: - __temp_shared_dir = __config.get(self.__cluster_section, "shared_dir") + __temp_shared_dir = self.__config.get(self.__cluster_section, "shared_dir") if not __temp_shared_dir: - print("ERROR: shared_dir defined but not set") + print("shared_dir defined but not set") sys.exit(1) self.parameters["SharedDir"] = __temp_shared_dir except configparser.NoOptionError: pass - def __load_ebs_options(self, __config): # noqa: C901 FIXME!!! + def __init_ebs_parameters(self): # noqa: C901 FIXME!!! try: - self.__ebs_settings = __config.get(self.__cluster_section, "ebs_settings") + self.__ebs_settings = self.__config.get(self.__cluster_section, "ebs_settings") if not self.__ebs_settings: - print("ERROR: ebs_settings defined by not set in [%s] section" % self.__cluster_section) - sys.exit(1) + self.__fail("ebs_settings defined by not set in [%s] section" % self.__cluster_section) # Modify list self.__ebs_section = self.__ebs_settings.split(",") - if len(self.__ebs_section) > self.__MAX_EBS_VOLUMES: - print( - "ERROR: number of EBS volumes requested is greater than the MAX.\n" - "Max number of EBS volumes supported is currently %s" % self.__MAX_EBS_VOLUMES + if len(self.__ebs_section) > self.MAX_EBS_VOLUMES: + self.__fail( + "number of EBS volumes requested is greater than the MAX.\n" + "Max number of EBS volumes supported is currently %s" % self.MAX_EBS_VOLUMES ) - sys.exit(1) self.parameters["NumberOfEBSVol"] = "%s" % len(self.__ebs_section) for i, item in enumerate(self.__ebs_section): item = "ebs %s" % item.strip() @@ -297,7 +900,7 @@ def __load_ebs_options(self, __config): # noqa: C901 FIXME!!! except configparser.NoOptionError: pass - self.__ebs_determine_shared_dir(__config) + self.__ebs_determine_shared_dir() # Dictionary list of all EBS options self.__ebs_options = dict( @@ -316,466 +919,18 @@ def __load_ebs_options(self, __config): # noqa: C901 FIXME!!! 
__temp_parameter_list = [] for section in self.__ebs_section: try: - __temp__ = __config.get(section, key) + __temp__ = self.__config.get(section, key) if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, section)) - sys.exit(1) - if self.__sanity_check and self.__ebs_options.get(key)[1] is not None: - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - self.__ebs_options.get(key)[1], - __temp__, - ) + self.__fail("%s defined but not set in [%s] section" % (key, section)) + if self.__ebs_options.get(key)[1] is not None: + self.__validate_resource(self.__ebs_options.get(key)[1], __temp__) __temp_parameter_list.append(__temp__) except configparser.NoOptionError: __temp_parameter_list.append("NONE") - pass # Fill the rest of the parameter with NONE - while len(__temp_parameter_list) < self.__MAX_EBS_VOLUMES: + while len(__temp_parameter_list) < self.MAX_EBS_VOLUMES: __temp_parameter_list.append("NONE") self.parameters[self.__ebs_options.get(key)[0]] = ",".join(x for x in __temp_parameter_list) except AttributeError: pass - - def __init__(self, args): # noqa: C901 FIXME!!! - self.args = args - self.cluster_options = self.__init_cluster_options() - self.size_parameters = self.__init_size_parameters() - self.batch_size_parameters = self.__init_batch_size_parameters() - self.parameters = {} - self.version = pkg_resources.get_distribution("aws-parallelcluster").version - self.__DEFAULT_CONFIG = False - self.__MAX_EBS_VOLUMES = 5 - __args_func = self.args.func.__name__ - - # Determine config file name based on args or default - if hasattr(args, "config_file") and args.config_file is not None: - self.__config_file = args.config_file - else: - self.__config_file = os.path.expanduser(os.path.join("~", ".parallelcluster", "config")) - self.__DEFAULT_CONFIG = True - if os.path.isfile(self.__config_file): - pass - else: - if self.__DEFAULT_CONFIG: - print("Default config %s not found" % self.__config_file) - print( - "You can copy a template from here: %s%sexamples%sconfig" - % ( - os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))), - os.path.sep, - os.path.sep, - ) - ) - sys.exit(1) - else: - print("Config file %s not found" % self.__config_file) - sys.exit(1) - - __config = configparser.ConfigParser() - __config.read(self.__config_file) - - # Determine the EC2 region to used used or default to us-east-1 - # Order is 1) CLI arg 2) AWS_DEFAULT_REGION env 3) Config file 4) us-east-1 - if hasattr(args, "region") and args.region: - self.region = args.region - else: - if os.environ.get("AWS_DEFAULT_REGION"): - self.region = os.environ.get("AWS_DEFAULT_REGION") - else: - try: - self.region = __config.get("aws", "aws_region_name") - except configparser.NoOptionError: - self.region = "us-east-1" - - # Check if credentials have been provided in config - try: - self.aws_access_key_id = __config.get("aws", "aws_access_key_id") - except configparser.NoOptionError: - self.aws_access_key_id = None - try: - self.aws_secret_access_key = __config.get("aws", "aws_secret_access_key") - except configparser.NoOptionError: - self.aws_secret_access_key = None - - # Determine which cluster template will be used - if __args_func in ["start", "stop", "instances"]: - # Starting and stopping a cluster is unique in that we would want to prevent the - # customer from inadvertently using a different template than what - # the cluster was created with, so we do not support the -t - # parameter. 
We always get the template to use from CloudFormation. - self.__cluster_template = get_stack_template( - self.region, self.aws_access_key_id, self.aws_secret_access_key, self.args.cluster_name - ) - else: - try: - if args.cluster_template is not None: - self.__cluster_template = args.cluster_template - else: - if __args_func == "update": - self.__cluster_template = get_stack_template( - self.region, self.aws_access_key_id, self.aws_secret_access_key, self.args.cluster_name - ) - else: - self.__cluster_template = __config.get("global", "cluster_template") - except AttributeError: - self.__cluster_template = __config.get("global", "cluster_template") - self.__cluster_section = "cluster %s" % self.__cluster_template - self.parameters["CLITemplate"] = self.__cluster_template - - # Check if package updates should be checked - try: - self.__update_check = __config.getboolean("global", "update_check") - except configparser.NoOptionError: - self.__update_check = True - - if self.__update_check is True: - try: - __latest = json.loads( - urllib.request.urlopen("http://pypi.python.org/pypi/aws-parallelcluster/json").read() - )["info"]["version"] - if self.version < __latest: - print("warning: There is a newer version %s of AWS ParallelCluster available." % __latest) - except Exception: - pass - - # Check if config sanity should be run - try: - self.__sanity_check = __config.getboolean("global", "sanity_check") - except configparser.NoOptionError: - self.__sanity_check = False - # Only check config on calls that mutate it - __args_func = self.args.func.__name__ - if ( - __args_func == "create" or __args_func == "update" or __args_func == "configure" - ) and self.__sanity_check is True: - pass - else: - self.__sanity_check = False - - # Get the EC2 keypair name to be used, exit if not set - try: - self.key_name = __config.get(self.__cluster_section, "key_name") - if not self.key_name: - print("ERROR: key_name set in [%s] section but not defined." % self.__cluster_section) - sys.exit(1) - if self.__sanity_check: - config_sanity.check_resource( - self.region, self.aws_access_key_id, self.aws_secret_access_key, "EC2KeyPair", self.key_name - ) - except configparser.NoOptionError: - print("ERROR: Missing key_name option in [%s] section." % self.__cluster_section) - sys.exit(1) - self.parameters["KeyName"] = self.key_name - - # Determine the CloudFormation URL to be used - # Order is 1) CLI arg 2) Config file 3) default for version + region - try: - if args.template_url is not None: - self.template_url = args.template_url - else: - try: - self.template_url = __config.get(self.__cluster_section, "template_url") - if not self.template_url: - print("ERROR: template_url set in [%s] section but not defined." 
% self.__cluster_section) - sys.exit(1) - if self.__sanity_check: - config_sanity.check_resource( - self.region, self.aws_access_key_id, self.aws_secret_access_key, "URL", self.template_url - ) - except configparser.NoOptionError: - s3_suffix = ".cn" if self.region.startswith("cn") else "" - self.template_url = ( - "https://s3.%s.amazonaws.com%s/%s-aws-parallelcluster/templates/" - "aws-parallelcluster-%s.cfn.json" % (self.region, s3_suffix, self.region, self.version) - ) - - except AttributeError: - pass - - # Determine which vpc settings section will be used - self.__vpc_settings = __config.get(self.__cluster_section, "vpc_settings") - self.__vpc_section = "vpc %s" % self.__vpc_settings - - # Dictionary list of all VPC options - self.__vpc_options = dict( - vpc_id=("VPCId", "VPC"), - master_subnet_id=("MasterSubnetId", "VPCSubnet"), - compute_subnet_cidr=("ComputeSubnetCidr", None), - compute_subnet_id=("ComputeSubnetId", "VPCSubnet"), - use_public_ips=("UsePublicIps", None), - ssh_from=("AccessFrom", None), - access_from=("AccessFrom", None), - additional_sg=("AdditionalSG", "VPCSecurityGroup"), - vpc_security_group_id=("VPCSecurityGroupId", "VPCSecurityGroup"), - ) - - self.__master_subnet = __config.get(self.__vpc_section, "master_subnet_id") - # Loop over all VPC options and add define to parameters, raise Exception is defined but null - for key in self.__vpc_options: - try: - __temp__ = __config.get(self.__vpc_section, key) - if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, self.__vpc_section)) - sys.exit(1) - if self.__sanity_check and self.__vpc_options.get(key)[1] is not None: - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - self.__vpc_options.get(key)[1], - __temp__, - ) - self.parameters[self.__vpc_options.get(key)[0]] = __temp__ - except configparser.NoOptionError: - pass - except configparser.NoSectionError: - print( - "ERROR: VPC section [%s] used in [%s] section is not defined" - % (self.__vpc_section, self.__cluster_section) - ) - sys.exit(1) - - if __config.has_option(self.__cluster_section, "scheduler"): - self.parameters["Scheduler"] = __config.get(self.__cluster_section, "scheduler") - else: - self.parameters["Scheduler"] = "sge" - - # Validate region for batch - if self.parameters["Scheduler"] == "awsbatch": - self.__run_batch_validation(__config) - else: - # Set defaults outside the cloudformation template - self.parameters["MinSize"] = "0" - self.parameters["DesiredSize"] = "2" - self.parameters["MaxSize"] = "10" - for key in self.size_parameters: - try: - __temp__ = __config.get(self.__cluster_section, key) - if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, self.__cluster_section)) - sys.exit(1) - if key == "initial_queue_size": - self.parameters["DesiredSize"] = __temp__ - elif key == "maintain_initial_size": - self.parameters["MinSize"] = self.parameters.get("DesiredSize") if __temp__ == "true" else "0" - elif key == "max_queue_size": - self.parameters["MaxSize"] = __temp__ - except configparser.NoOptionError: - pass - - # Loop over all the cluster options and add define to parameters, raise Exception if defined but null - for key in self.cluster_options: - try: - __temp__ = __config.get(self.__cluster_section, key) - if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, self.__cluster_section)) - sys.exit(1) - if self.__sanity_check and self.cluster_options.get(key)[1] is not None: - config_sanity.check_resource( - 
self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - self.cluster_options.get(key)[1], - __temp__, - ) - self.parameters[self.cluster_options.get(key)[0]] = __temp__ - except configparser.NoOptionError: - pass - - # check for extra_json = { "cluster" : ... } configuration parameters and map to "cfncluster" - extra_json = self.parameters.get("ExtraJson") - if extra_json: - extra_json = json.loads(extra_json) - if "cluster" in extra_json: - # support parallelcluster syntax by replacing the key - extra_json["cfncluster"] = extra_json.pop("cluster") - self.parameters["ExtraJson"] = json.dumps(extra_json) - - # Merge tags from config with tags from command line args - # Command line args take precedent and overwite tags supplied in the config - self.tags = {} - try: - tags = __config.get(self.__cluster_section, "tags") - self.tags = json.loads(tags) - except configparser.NoOptionError: - pass - try: - if args.tags is not None: - for key in args.tags: - self.tags[key] = args.tags[key] - except AttributeError: - pass - - # Initialize EBS related options - self.__load_ebs_options(__config) - - # Initialize EFS related options - self.__get_efs_parameters(__config) - - # Parse RAID related options - self.__get_raid_parameters(__config) - - # Determine if scaling settings are defined and set section - try: - self.__scaling_settings = __config.get(self.__cluster_section, "scaling_settings") - if not self.__scaling_settings: - print("ERROR: scaling_settings defined by not set in [%s] section" % self.__cluster_section) - sys.exit(1) - self.__scaling_section = "scaling %s" % self.__scaling_settings - except configparser.NoOptionError: - pass - - # Dictionary list of all scaling options - self.__scaling_options = dict(scaledown_idletime=("ScaleDownIdleTime", None)) - - try: - if self.__scaling_section: - for key in self.__scaling_options: - try: - __temp__ = __config.get(self.__scaling_section, key) - if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, self.__scaling_section)) - sys.exit(1) - if self.__sanity_check and self.__scaling_options.get(key)[1] is not None: - config_sanity.check_resource( - self.region, - self.aws_access_key_id, - self.aws_secret_access_key, - self.__scaling_options.get(key)[1], - __temp__, - ) - self.parameters[self.__scaling_options.get(key)[0]] = __temp__ - except configparser.NoOptionError: - pass - except AttributeError: - pass - - # handle aliases - self.aliases = {} - self.__alias_section = "aliases" - if __config.has_section(self.__alias_section): - for alias in __config.options(self.__alias_section): - self.aliases[alias] = __config.get(self.__alias_section, alias) - - # Handle extra parameters supplied on command-line - try: - if self.args.extra_parameters is not None: - self.parameters.update(dict(self.args.extra_parameters)) - except AttributeError: - pass - - @staticmethod - def __init_size_parameters(): - return OrderedDict( - initial_queue_size=("InitialQueueSize", None), - maintain_initial_size=("MaintainInitialSize", None), - max_queue_size=("MaxQueueSize", None), - ) - - @staticmethod - def __init_batch_size_parameters(): - return dict(min_vcpus=("MinVCpus", None), desired_vcpus=("DesiredVCpus", None), max_vcpus=("MaxVCpus", None)) - - @staticmethod - def __init_cluster_options(): - return dict( - cluster_user=("ClusterUser", None), - compute_instance_type=("ComputeInstanceType", None), - master_instance_type=("MasterInstanceType", None), - scheduler=("Scheduler", None), - cluster_type=("ClusterType", None), - 
ephemeral_dir=("EphemeralDir", None), - spot_price=("SpotPrice", None), - custom_ami=("CustomAMI", "EC2Ami"), - pre_install=("PreInstallScript", "URL"), - post_install=("PostInstallScript", "URL"), - proxy_server=("ProxyServer", None), - placement=("Placement", None), - placement_group=("PlacementGroup", "EC2PlacementGroup"), - encrypted_ephemeral=("EncryptedEphemeral", None), - pre_install_args=("PreInstallArgs", None), - post_install_args=("PostInstallArgs", None), - s3_read_resource=("S3ReadResource", None), - s3_read_write_resource=("S3ReadWriteResource", None), - tenancy=("Tenancy", None), - master_root_volume_size=("MasterRootVolumeSize", None), - compute_root_volume_size=("ComputeRootVolumeSize", None), - base_os=("BaseOS", None), - ec2_iam_role=("EC2IAMRoleName", "EC2IAMRoleName"), - extra_json=("ExtraJson", None), - custom_chef_cookbook=("CustomChefCookbook", None), - custom_chef_runlist=("CustomChefRunList", None), - additional_cfn_template=("AdditionalCfnTemplate", None), - custom_awsbatch_template_url=("CustomAWSBatchTemplateURL", None), - ) - - def __check_option_absent_awsbatch(self, config, option): - if config.has_option(self.__cluster_section, option): - print("ERROR: option %s cannot be used with awsbatch" % option) - sys.exit(1) - - def __validate_awsbatch_os(self, baseos): - supported_batch_oses = ["alinux"] - if baseos not in supported_batch_oses: - print("ERROR: awsbatch scheduler supports following OSes: %s" % supported_batch_oses) - sys.exit(1) - - def __run_batch_validation(self, config): # noqa: C901 FIXME!!! - self.__check_option_absent_awsbatch(config, "initial_queue_size") - self.__check_option_absent_awsbatch(config, "maintain_initial_size") - self.__check_option_absent_awsbatch(config, "max_queue_size") - self.__check_option_absent_awsbatch(config, "spot_price") - - if config.has_option(self.__cluster_section, "base_os"): - self.__validate_awsbatch_os(config.get(self.__cluster_section, "base_os")) - - if config.has_option(self.__cluster_section, "compute_instance_type"): - compute_instance_type = config.get(self.__cluster_section, "compute_instance_type") - self.parameters["ComputeInstanceType"] = compute_instance_type - else: - # use 'optimal' as default for awsbatch - self.parameters["ComputeInstanceType"] = "optimal" - - if config.has_option(self.__cluster_section, "spot_bid_percentage"): - spot_bid_percentage = config.get(self.__cluster_section, "spot_bid_percentage") - # use spot price to indicate spot bid percentage in case of awsbatch - self.parameters["SpotPrice"] = spot_bid_percentage - - if config.has_option(self.__cluster_section, "custom_awsbatch_template_url"): - awsbatch_custom_url = config.get(self.__cluster_section, "custom_awsbatch_template_url") - if not awsbatch_custom_url: - print( - "ERROR: custom_awsbatch_template_url set in [%s] section but not defined." 
% self.__cluster_section - ) - sys.exit(1) - self.parameters["CustomAWSBatchTemplateURL"] = awsbatch_custom_url - - # Set batch default size parameters - self.parameters["MinSize"] = "0" - self.parameters["DesiredSize"] = "4" - self.parameters["MaxSize"] = "20" - - # Override those parameters from config if they are available - for key in self.batch_size_parameters: - try: - __temp__ = config.get(self.__cluster_section, key) - if not __temp__: - print("ERROR: %s defined but not set in [%s] section" % (key, self.__cluster_section)) - sys.exit(1) - if key == "min_vcpus": - self.parameters["MinSize"] = __temp__ - elif key == "desired_vcpus": - self.parameters["DesiredSize"] = __temp__ - elif key == "max_vcpus": - self.parameters["MaxSize"] = __temp__ - except configparser.NoOptionError: - pass - - if self.__sanity_check: - config_sanity.check_resource( - self.region, self.aws_access_key_id, self.aws_secret_access_key, "AWSBatch_Parameters", self.parameters - ) diff --git a/cli/pcluster/cli.py b/cli/pcluster/cli.py index 6b23a2e9b5..dc971230e6 100644 --- a/cli/pcluster/cli.py +++ b/cli/pcluster/cli.py @@ -18,6 +18,7 @@ import textwrap import argparse +from botocore.exceptions import NoCredentialsError from pcluster import easyconfig, pcluster @@ -114,8 +115,8 @@ def _get_parser(): """ parser = argparse.ArgumentParser( description="pcluster is the AWS ParallelCluster CLI and permits " - "to launch and manage HPC clusters in the AWS cloud.", - epilog='For command specific flags run "pcluster [command] --help"', + "launching and management of HPC clusters in the AWS cloud.", + epilog='For command specific flags, please run: "pcluster [command] --help"', ) subparsers = parser.add_subparsers() subparsers.required = True @@ -123,8 +124,9 @@ def _get_parser(): # create command subparser create_example = textwrap.dedent( - """When the command is called and it starts polling for status of that call -it is safe to "Ctrl-C" out. You can always return to that status by calling "pcluster status mycluster". + """When the command is called and begins polling for status of that call +, it is safe to use 'Ctrl-C' to exit. You can return to viewing the current +status by calling "pcluster status mycluster". Examples:: @@ -139,7 +141,7 @@ def _get_parser(): ) pcreate.add_argument( "cluster_name", - help="name for the cluster. The CloudFormation Stack name will be " "parallelcluster-[cluster_name]", + help="Defines the name of the cluster. 
The CloudFormation stack name will be " "parallelcluster-[cluster_name]", ) _addarg_config(pcreate) _addarg_region(pcreate) @@ -150,17 +152,17 @@ def _get_parser(): pcreate.add_argument( "-u", "--template-url", - help="specify URL for the custom CloudFormation template, " "if it has been used at creation time", + help="specify a URL for the custom CloudFormation template, " "if it was used at creation time", ) pcreate.add_argument("-t", "--cluster-template", help="cluster template to use") pcreate.add_argument("-p", "--extra-parameters", type=json.loads, help="add extra parameters to stack create") - pcreate.add_argument("-g", "--tags", type=json.loads, help="tags to be added to the stack") + pcreate.add_argument("-g", "--tags", type=json.loads, help="additional tags to be added to the stack") pcreate.set_defaults(func=create) # update command subparser pupdate = subparsers.add_parser( "update", - help="Updates a running cluster by using the values in the config " "file or a TEMPLATE_URL provided.", + help="Updates a running cluster using the values in the config " "file or a TEMPLATE_URL provided.", epilog="When the command is called and it starts polling for status of that call " 'it is safe to "Ctrl-C" out. You can always return to that status by ' 'calling "pcluster status mycluster"', @@ -170,7 +172,7 @@ def _get_parser(): _addarg_region(pupdate) _addarg_nowait(pupdate) pupdate.add_argument( - "-nr", "--norollback", action="store_true", default=False, help="disable CloudFormation Stack rollback on error" + "-nr", "--norollback", action="store_true", default=False, help="disable CloudFormation stack rollback on error" ) pupdate.add_argument("-u", "--template-url", help="URL for a custom CloudFormation template") pupdate.add_argument("-t", "--cluster-template", help="specific cluster template to use") @@ -237,7 +239,7 @@ def _get_parser(): plist = subparsers.add_parser( "list", help="Displays a list of stacks associated with AWS ParallelCluster.", - epilog="Lists the Stack Name of the CloudFormation stacks named parallelcluster-*", + epilog="Lists the name of any CloudFormation stacks named parallelcluster-*", ) _addarg_config(plist) _addarg_region(plist) @@ -256,11 +258,11 @@ def _get_parser(): $ pcluster ssh mycluster -i ~/.ssh/id_rsa -results in an ssh command with username and IP address pre-filled:: +Returns an ssh command with the cluster username and IP address pre-populated:: $ ssh ec2-user@1.1.1.1 -i ~/.ssh/id_rsa -SSH command is defined in the global config file, under the aliases section and can be customized:: +SSH command is defined in the global config file under the aliases section and can be customized:: [aliases] ssh = ssh {CFN_USER}@{MASTER_IP} {ARGS} @@ -273,8 +275,8 @@ def _get_parser(): ) pssh = subparsers.add_parser( "ssh", - help="Connect to the master server using SSH.", - description="Run ssh command with username and IP address pre-filled. " + help="Connect to the master instance using SSH.", + description="Run ssh command with the cluster username and IP address pre-populated. " "Arbitrary arguments are appended to the end of the ssh command. " "This command may be customized in the aliases " "section of the config file.", @@ -299,7 +301,7 @@ def _get_parser(): "--os", dest="base_ami_os", required=True, - help="specify the OS of the base AMI. " "Valid values are alinux, ubuntu1404, ubuntu1604, centos6 or centos7", + help="specify the OS of the base AMI. 
" "Valid options are: alinux, ubuntu1404, ubuntu1604, centos6, centos7", ) pami.add_argument( "-ap", @@ -325,7 +327,7 @@ def _get_parser(): pconfigure.set_defaults(func=configure) # version command subparser - pversion = subparsers.add_parser("version", help="Display version of AWS ParallelCluster.") + pversion = subparsers.add_parser("version", help="Display the version of AWS ParallelCluster.") pversion.set_defaults(func=version) return parser @@ -340,11 +342,16 @@ def main(): parser = _get_parser() args, extra_args = parser.parse_known_args() logger.debug(args) - if args.func.__name__ == "command": - args.func(args, extra_args) - else: - if extra_args: - parser.print_usage() - print("Invalid arguments %s..." % extra_args) - sys.exit(1) - args.func(args) + + try: + if args.func.__name__ == "command": + args.func(args, extra_args) + else: + if extra_args: + parser.print_usage() + print("Invalid arguments %s..." % extra_args) + sys.exit(1) + args.func(args) + except NoCredentialsError: + logger.error("AWS Credentials not found.") + sys.exit(1) diff --git a/cli/pcluster/config_sanity.py b/cli/pcluster/config_sanity.py index 5835edf567..82a7a080dc 100644 --- a/cli/pcluster/config_sanity.py +++ b/cli/pcluster/config_sanity.py @@ -25,447 +25,557 @@ from botocore.exceptions import ClientError -def get_partition(region): - if region.startswith("us-gov"): - return "aws-us-gov" - return "aws" +class ResourceValidator(object): + """Utility class to check resource sanity.""" + def __init__(self, region, aws_access_key_id, aws_secret_access_key): + """ + Initialize a ResourceValidator object. -def check_sg_rules_for_port(rule, port_to_check): - port = rule.get("FromPort") - ip_rules = rule.get("IpRanges") - group = rule.get("UserIdGroupPairs") - for ip_rule in ip_rules: - ip = ip_rule.get("CidrIp") - # An existing rule is valid for EFS if, it allows all traffic(0.0.0.0/0) - # from all ports or NFS(port 2049), and does not have a security group restriction - if (not port or port == port_to_check) and ip == "0.0.0.0/0" and not group: - return True + :param region: AWS Region + :param aws_access_key_id: AWS access key + :param aws_secret_access_key: AWS secret access key + """ + self.region = region + self.aws_access_key_id = aws_access_key_id + self.aws_secret_access_key = aws_secret_access_key + def __get_partition(self): + if self.region.startswith("us-gov"): + return "aws-us-gov" + return "aws" -def check_efs_fs_id(ec2, efs, resource_value): # noqa: C901 FIXME!!! - try: - # Check to see if there is any existing mt on the fs - mt = efs.describe_mount_targets(FileSystemId=resource_value[0]) - # Get the availability zone of the stack - availability_zone = ( - ec2.describe_subnets(SubnetIds=[resource_value[1]]).get("Subnets")[0].get("AvailabilityZone") - ) - mt_id = None - for item in mt.get("MountTargets"): - # Check to see if there is an existing mt in the az of the stack - mt_subnet = item.get("SubnetId") - if availability_zone == ec2.describe_subnets(SubnetIds=[mt_subnet]).get("Subnets")[0].get( - "AvailabilityZone" - ): - mt_id = item.get("MountTargetId") - # If there is an existing mt in the az, need to check the inbound and outbound rules of the security groups - if mt_id: - nfs_access = False + @staticmethod + def __check_sg_rules_for_port(rule, port_to_check): + """ + Verify if the security group rule accepts connections to the given port. 
+ + :param rule: The rule to check + :param port_to_check: The port to check + :return: True if the rule accepts connection, False otherwise + """ + port = rule.get("FromPort") + ip_rules = rule.get("IpRanges") + group = rule.get("UserIdGroupPairs") + + is_valid = False + for ip_rule in ip_rules: + ip = ip_rule.get("CidrIp") + # An existing rule is valid for EFS if, it allows all traffic(0.0.0.0/0) + # from all ports or the given port, and does not have a security group restriction + if (not port or port == port_to_check) and ip == "0.0.0.0/0" and not group: + is_valid = True + break + + return is_valid + + def __check_efs_fs_id(self, ec2, efs, resource_value): # noqa: C901 FIXME!!! + try: + # Check to see if there is any existing mt on the fs + mt = efs.describe_mount_targets(FileSystemId=resource_value[0]) + # Get the availability zone of the stack + availability_zone = ( + ec2.describe_subnets(SubnetIds=[resource_value[1]]).get("Subnets")[0].get("AvailabilityZone") + ) + mt_id = None + for item in mt.get("MountTargets"): + # Check to see if there is an existing mt in the az of the stack + mt_subnet = item.get("SubnetId") + if availability_zone == ec2.describe_subnets(SubnetIds=[mt_subnet]).get("Subnets")[0].get( + "AvailabilityZone" + ): + mt_id = item.get("MountTargetId") + # If there is an existing mt in the az, need to check the inbound and outbound rules of the security groups + if mt_id: + nfs_access = False + in_access = False + out_access = False + # Get list of security group IDs of the mount target + sg_ids = efs.describe_mount_target_security_groups(MountTargetId=mt_id).get("SecurityGroups") + for sg in ec2.describe_security_groups(GroupIds=sg_ids).get("SecurityGroups"): + # Check all inbound rules + in_rules = sg.get("IpPermissions") + for rule in in_rules: + if self.__check_sg_rules_for_port(rule, 2049): + in_access = True + break + out_rules = sg.get("IpPermissionsEgress") + for rule in out_rules: + if self.__check_sg_rules_for_port(rule, 2049): + out_access = True + break + if in_access and out_access: + nfs_access = True + break + if not nfs_access: + self.__fail( + "EFSFSId", + "There is an existing Mount Target %s in the Availability Zone %s for EFS %s, " + "and it does not have a security group with inbound and outbound rules that support NFS. " + "Please modify the Mount Target's security group, or delete the Mount Target." 
+ % (mt_id, availability_zone, resource_value[0]), + ) + except ClientError as e: + self.__fail("EFSFSId", e.response.get("Error").get("Message")) + + def __check_nfs_access(self, ec2, network_interfaces): + nfs_access = False + for network_interface in network_interfaces: in_access = False out_access = False - # Get list of security group IDs of the mount target - sg_ids = efs.describe_mount_target_security_groups(MountTargetId=mt_id).get("SecurityGroups") + # Get list of security group IDs + sg_ids = [i.get("GroupId") for i in network_interface.get("Groups")] + # Check each sg to see if the rules are valid for sg in ec2.describe_security_groups(GroupIds=sg_ids).get("SecurityGroups"): # Check all inbound rules in_rules = sg.get("IpPermissions") for rule in in_rules: - if check_sg_rules_for_port(rule, 2049): + if self.__check_sg_rules_for_port(rule, 988): in_access = True break out_rules = sg.get("IpPermissionsEgress") for rule in out_rules: - if check_sg_rules_for_port(rule, 2049): + if self.__check_sg_rules_for_port(rule, 988): out_access = True break if in_access and out_access: nfs_access = True break - if not nfs_access: - print( - "Config sanity error: There is an existing Mount Target %s in the Availability Zone %s for EFS %s, " - "and it does not have a security group with inbound and outbound rules that support NFS. " - "Please modify the Mount Target's security group, or delete the Mount Target." - % (mt_id, availability_zone, resource_value[0]) - ) - sys.exit(1) - return True - except ClientError as e: - print("Config sanity error: %s" % e.response.get("Error").get("Message")) - sys.exit(1) - + if nfs_access: + return True -def check_resource( # noqa: C901 FIXME!!! - region, aws_access_key_id, aws_secret_access_key, resource_type, resource_value -): + return nfs_access - # Loop over all supported resource checks - # EC2 KeyPair - if resource_type == "EC2KeyPair": + def __check_fsx_fs_id(self, ec2, fsx, resource_value): try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - test = ec2.describe_key_pairs(KeyNames=[resource_value]) + # Check to see if there is any existing mt on the fs + fs = fsx.describe_file_systems(FileSystemIds=[resource_value[0]]).get("FileSystems")[0] + stack_vpc = ec2.describe_subnets(SubnetIds=[resource_value[1]]).get("Subnets")[0].get("VpcId") + # Check to see if fs is in the same VPC as the stack + if fs.get("VpcId") != stack_vpc: + self.__fail( + "VpcId", + "Currently only support using FSx file system that is in the same VPC as the stack. " + "The file system provided is in %s" % fs.get("VpcId"), + ) + # If there is an existing mt in the az, need to check the inbound and outbound rules of the security groups + network_interface_ids = fs.get("NetworkInterfaceIds") + network_interface_responses = ec2.describe_network_interfaces( + NetworkInterfaceIds=network_interface_ids + ).get("NetworkInterfaces") + network_interfaces = [i for i in network_interface_responses if i.get("VpcId") == stack_vpc] + nfs_access = self.__check_nfs_access(ec2, network_interfaces) + if not nfs_access: + self.__fail( + "FSXFSId" + "The current security group settings on file system %s does not satisfy " + "mounting requirement. The file system must be associated to a security group that allows " + "inbound and outbound TCP traffic from 0.0.0.0/0 through port 988." 
% resource_value[0] + ) + return True except ClientError as e: - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - if resource_type == "EC2IAMRoleName": - try: - iam = boto3.client( - "iam", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) + self.__fail("FSXFSId", e.response.get("Error").get("Message")) - arn = iam.get_role(RoleName=resource_value).get("Role").get("Arn") - accountid = ( - boto3.client( - "sts", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, + def __validate_fsx_parameters(self, resource_type, resource_value): + # FSX FS Id check + if resource_type == "fsx_fs_id": + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, ) - .get_caller_identity() - .get("Account") - ) + fsx = boto3.client( + "fsx", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + self.__check_fsx_fs_id(ec2, fsx, resource_value) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + # FSX capacity size check + elif resource_type == "FSx_storage_capacity": + if int(resource_value) % 3600 != 0 or int(resource_value) < 0: + self.__fail( + resource_type, "Capacity for FSx lustre filesystem, minimum of 3,600 GB, increments of 3,600 GB" + ) + # FSX file chunk size check + elif resource_type == "FSx_imported_file_chunk_size": + # 1,024 MiB (1 GiB) and can go as high as 512,000 MiB + if not (1 <= int(resource_value) <= 512000): + self.__fail(resource_type, "has a minimum size of 1 MiB, and max size of 512,000 MiB") - partition = get_partition(region) + def validate(self, resource_type, resource_value): # noqa: C901 FIXME + """ + Validate the given resource. Print an error and exit in case of error. 
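Editor's note: a hedged usage sketch of the refactored validator introduced by this patch. The region, credentials, and key name below are placeholders; `validate()` prints "Config sanity error on resource ..." and exits on failure, so reaching the next statement implies the check passed, and the `EC2KeyPair` check performs a real EC2 API call.

```
# Illustrative usage of the new ResourceValidator class (placeholder region/credentials/values).
from pcluster.config_sanity import ResourceValidator

validator = ResourceValidator(
    region="us-east-1",
    aws_access_key_id=None,        # fall back to the default boto3 credential chain
    aws_secret_access_key=None,
)
validator.validate("EC2KeyPair", "mykey")           # exits if the key pair does not exist
validator.validate("RAIDType", "0")                 # only RAID 0 and RAID 1 are accepted
validator.validate("FSx_storage_capacity", "3600")  # must be a non-negative multiple of 3,600 GB
```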
- iam_policy = [ - ( - [ - "ec2:DescribeVolumes", - "ec2:AttachVolume", - "ec2:DescribeInstanceAttribute", - "ec2:DescribeInstanceStatus", - "ec2:DescribeInstances", - ], - "*", - ), - (["dynamodb:ListTables"], "*"), - ( - [ - "sqs:SendMessage", - "sqs:ReceiveMessage", - "sqs:ChangeMessageVisibility", - "sqs:DeleteMessage", - "sqs:GetQueueUrl", - ], - "arn:%s:sqs:%s:%s:parallelcluster-*" % (partition, region, accountid), - ), - ( - [ - "autoscaling:DescribeAutoScalingGroups", - "autoscaling:TerminateInstanceInAutoScalingGroup", - "autoscaling:SetDesiredCapacity", - "autoscaling:DescribeTags", - "autoScaling:UpdateAutoScalingGroup", - ], - "*", - ), - ( - [ - "dynamodb:PutItem", - "dynamodb:Query", - "dynamodb:GetItem", - "dynamodb:DeleteItem", - "dynamodb:DescribeTable", - ], - "arn:%s:dynamodb:%s:%s:table/parallelcluster-*" % (partition, region, accountid), - ), - ( - ["cloudformation:DescribeStacks"], - "arn:%s:cloudformation:%s:%s:stack/parallelcluster-*" % (partition, region, accountid), - ), - (["s3:GetObject"], "arn:%s:s3:::%s-aws-parallelcluster/*" % (partition, region)), - (["sqs:ListQueues"], "*"), - ] + :param resource_type: Resource type + :param resource_value: Resource value + """ + # Loop over all supported resource checks + if resource_type == "EC2KeyPair": + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + ec2.describe_key_pairs(KeyNames=[resource_value]) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + if resource_type == "EC2IAMRoleName": + try: + iam = boto3.client( + "iam", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) - for actions, resource_arn in iam_policy: - response = iam.simulate_principal_policy( - PolicySourceArn=arn, ActionNames=actions, ResourceArns=[resource_arn] + arn = iam.get_role(RoleName=resource_value).get("Role").get("Arn") + account_id = ( + boto3.client( + "sts", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + .get_caller_identity() + .get("Account") ) - for decision in response.get("EvaluationResults"): - if decision.get("EvalDecision") != "allowed": - print( - "IAM role error on user provided role %s: action %s is %s" - % (resource_value, decision.get("EvalActionName"), decision.get("EvalDecision")) - ) - print("See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html") - sys.exit(1) - except ClientError as e: - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - # VPC Id - elif resource_type == "VPC": - try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - test = ec2.describe_vpcs(VpcIds=[resource_value]) - except ClientError as e: - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - # Check for DNS support in the VPC - if ( - not ec2.describe_vpc_attribute(VpcId=resource_value, Attribute="enableDnsSupport") - .get("EnableDnsSupport") - .get("Value") - ): - print("DNS Support is not enabled in %s" % resource_value) - sys.exit(1) - if ( - not ec2.describe_vpc_attribute(VpcId=resource_value, Attribute="enableDnsHostnames") - .get("EnableDnsHostnames") - .get("Value") - ): - print("DNS 
Hostnames not enabled in %s" % resource_value) - sys.exit(1) - # VPC Subnet Id - elif resource_type == "VPCSubnet": - try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - test = ec2.describe_subnets(SubnetIds=[resource_value]) - except ClientError as e: - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - # VPC Security Group - elif resource_type == "VPCSecurityGroup": - try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - test = ec2.describe_security_groups(GroupIds=[resource_value]) - except ClientError as e: - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - # EC2 AMI Id - elif resource_type == "EC2Ami": - try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - test = ec2.describe_images(ImageIds=[resource_value]) - except ClientError as e: - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - # EC2 Placement Group - elif resource_type == "EC2PlacementGroup": - if resource_value == "DYNAMIC": - pass - else: + partition = self.__get_partition() + + iam_policy = [ + ( + [ + "ec2:DescribeVolumes", + "ec2:AttachVolume", + "ec2:DescribeInstanceAttribute", + "ec2:DescribeInstanceStatus", + "ec2:DescribeInstances", + ], + "*", + ), + (["dynamodb:ListTables"], "*"), + ( + [ + "sqs:SendMessage", + "sqs:ReceiveMessage", + "sqs:ChangeMessageVisibility", + "sqs:DeleteMessage", + "sqs:GetQueueUrl", + ], + "arn:%s:sqs:%s:%s:parallelcluster-*" % (partition, self.region, account_id), + ), + ( + [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:TerminateInstanceInAutoScalingGroup", + "autoscaling:SetDesiredCapacity", + "autoscaling:DescribeTags", + "autoScaling:UpdateAutoScalingGroup", + ], + "*", + ), + ( + [ + "dynamodb:PutItem", + "dynamodb:Query", + "dynamodb:GetItem", + "dynamodb:DeleteItem", + "dynamodb:DescribeTable", + ], + "arn:%s:dynamodb:%s:%s:table/parallelcluster-*" % (partition, self.region, account_id), + ), + ( + ["cloudformation:DescribeStacks"], + "arn:%s:cloudformation:%s:%s:stack/parallelcluster-*" % (partition, self.region, account_id), + ), + (["s3:GetObject"], "arn:%s:s3:::%s-aws-parallelcluster/*" % (partition, self.region)), + (["sqs:ListQueues"], "*"), + ] + + for actions, resource_arn in iam_policy: + response = iam.simulate_principal_policy( + PolicySourceArn=arn, ActionNames=actions, ResourceArns=[resource_arn] + ) + for decision in response.get("EvaluationResults"): + if decision.get("EvalDecision") != "allowed": + print( + "IAM role error on user provided role %s: action %s is %s" + % (resource_value, decision.get("EvalActionName"), decision.get("EvalDecision")) + ) + print("See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html") + sys.exit(1) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + # VPC Id + elif resource_type == "VPC": try: ec2 = boto3.client( "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, ) - test = 
ec2.describe_placement_groups(GroupNames=[resource_value]) + ec2.describe_vpcs(VpcIds=[resource_value]) except ClientError as e: - print( - "Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message")) - ) - sys.exit(1) - # URL - elif resource_type == "URL": - scheme = urlparse(resource_value).scheme - if scheme == "s3": - pass - else: + self.__fail(resource_type, e.response.get("Error").get("Message")) + # Check for DNS support in the VPC + if ( + not ec2.describe_vpc_attribute(VpcId=resource_value, Attribute="enableDnsSupport") + .get("EnableDnsSupport") + .get("Value") + ): + self.__fail(resource_type, "DNS Support is not enabled in %s" % resource_value) + if ( + not ec2.describe_vpc_attribute(VpcId=resource_value, Attribute="enableDnsHostnames") + .get("EnableDnsHostnames") + .get("Value") + ): + self.__fail(resource_type, "DNS Hostnames not enabled in %s" % resource_value) + # VPC Subnet Id + elif resource_type == "VPCSubnet": try: - urllib.request.urlopen(resource_value) - except urllib.error.HTTPError as e: - print("Config sanity error:", resource_value, e.code, e.reason) - sys.exit(1) - except urllib.error.URLError as e: - print("Config sanity error:", resource_value, e.reason) - sys.exit(1) - # EC2 EBS Snapshot Id - elif resource_type == "EC2Snapshot": - try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - test = ec2.describe_snapshots(SnapshotIds=[resource_value]).get("Snapshots")[0] - if test.get("State") != "completed": - print("Snapshot %s is in state '%s' not 'completed'" % (resource_value, test.get("State"))) - sys.exit(1) - except ClientError as e: - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - # EC2 EBS Volume Id - elif resource_type == "EC2Volume": - try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - test = ec2.describe_volumes(VolumeIds=[resource_value]).get("Volumes")[0] - if test.get("State") != "available": - print("Volume %s is in state '%s' not 'available'" % (resource_value, test.get("State"))) - sys.exit(1) - except ClientError as e: - if e.response.get("Error").get("Message").endswith("parameter volumes is invalid. Expected: 'vol-...'."): - print("Config sanity error: volume %s does not exist." % resource_value) - sys.exit(1) - print("Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message"))) - sys.exit(1) - # EFS file system Id - elif resource_type == "EFSFSId": - try: - ec2 = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - efs = boto3.client( - "efs", - region_name=region, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - return check_efs_fs_id(ec2, efs, resource_value) - except ClientError as e: - print("Config sanity error: %s" % e.response.get("Error").get("Message")) - sys.exit(1) - # EFS Performance Mode check - elif resource_type == "EFSPerfMode": - if resource_value != "generalPurpose" and resource_value != "maxIO": - print( - "Config sanity error: Invalid value for 'performance_mode'! 
" - "Acceptable values for 'performance_mode' are generalPurpose and maxIO" - ) - sys.exit(1) - # EFS Throughput check - elif resource_type == "EFSThroughput": - throughput_mode = resource_value[0] - provisioned_throughput = resource_value[1] - if throughput_mode and (throughput_mode != "provisioned" and throughput_mode != "bursting"): - print( - "Config sanity error: Invalid value for 'throughput_mode'! " - "Acceptable values for 'throughput_mode' are bursting and provisioned" - ) - sys.exit(1) - if provisioned_throughput is not None: - if throughput_mode != "provisioned": - print( - "Config sanity error: When specifying 'provisioned_throughput', " - "the 'throughput_mode' must be set to provisioned" + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, ) - sys.exit(1) - else: - if throughput_mode == "provisioned": - print( - "Config sanity error: When specifying 'throughput_mode' to provisioned, " - "the 'provisioned_throughput' option must be specified" + ec2.describe_subnets(SubnetIds=[resource_value]) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + # VPC Security Group + elif resource_type == "VPCSecurityGroup": + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, ) - sys.exit(1) - # RAID EBS IOPS - elif resource_type == "RAIDIOPS": - raid_iops = float(resource_value[0]) - raid_vol_size = float(resource_value[1]) - if raid_iops > raid_vol_size * 50: - print( - "Config sanity error: IOPS to volume size ratio of %s is too high; maximum is 50." - % (raid_iops / raid_vol_size) - ) - sys.exit(1) - # RAID Array Type - elif resource_type == "RAIDType": - if resource_value != "0" and resource_value != "1": - print("Config sanity error: invalid raid_type, only RAID 0 and RAID 1 are currently supported.") - sys.exit(1) - # Number of RAID Volumes Requested - elif resource_type == "RAIDNumVol": - if int(resource_value) > 5 or int(resource_value) < 2: - print( - "Config sanity error: invalid num_of_raid_volumes. " - "Needs min of 2 volumes for RAID and max of 5 EBS volumes are currently supported." 
- ) - sys.exit(1) - # Batch Parameters - elif resource_type == "AWSBatch_Parameters": - # Check region - if region in ["ap-northeast-3", "eu-north-1", "cn-north-1", "cn-northwest-1", "us-gov-east-1", "us-gov-west-1"]: - print("ERROR: %s region is not supported with awsbatch" % region) - sys.exit(1) - - # Check compute instance types - if "ComputeInstanceType" in resource_value: + ec2.describe_security_groups(GroupIds=[resource_value]) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + # EC2 AMI Id + elif resource_type == "EC2Ami": try: - s3 = boto3.resource("s3", region_name=region) - bucket_name = "%s-aws-parallelcluster" % region - file_name = "instances/batch_instances.json" + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + ec2.describe_images(ImageIds=[resource_value]) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + # EC2 Placement Group + elif resource_type == "EC2PlacementGroup": + if resource_value == "DYNAMIC": + pass + else: try: - file_contents = s3.Object(bucket_name, file_name).get()["Body"].read().decode("utf-8") - supported_instances = json.loads(file_contents) - for instance in resource_value["ComputeInstanceType"].split(","): - if not instance.strip() in supported_instances: - print("Instance type %s not supported by batch in this region" % instance) - sys.exit(1) + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + ec2.describe_placement_groups(GroupNames=[resource_value]) except ClientError as e: - print( - "Config sanity error on resource %s: %s" - % (resource_type, e.response.get("Error").get("Message")) + self.__fail(resource_type, e.response.get("Error").get("Message")) + # URL + elif resource_type == "URL": + scheme = urlparse(resource_value).scheme + if scheme == "s3": + pass + else: + try: + urllib.request.urlopen(resource_value) + except urllib.error.HTTPError as e: + self.__fail(resource_type, "%s %s %s" % (resource_value, e.code, e.reason)) + except urllib.error.URLError as e: + self.__fail(resource_type, "%s %s" % (resource_value, e.reason)) + # EC2 EBS Snapshot Id + elif resource_type == "EC2Snapshot": + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + test = ec2.describe_snapshots(SnapshotIds=[resource_value]).get("Snapshots")[0] + if test.get("State") != "completed": + self.__fail( + resource_type, + "Snapshot %s is in state '%s' not 'completed'" % (resource_value, test.get("State")), ) - sys.exit(1) except ClientError as e: - print( - "Config sanity error on resource %s: %s" % (resource_type, e.response.get("Error").get("Message")) + self.__fail(resource_type, e.response.get("Error").get("Message")) + # EC2 EBS Volume Id + elif resource_type == "EC2Volume": + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, ) - sys.exit(1) + test = ec2.describe_volumes(VolumeIds=[resource_value]).get("Volumes")[0] + if test.get("State") != "available": + self.__fail( + resource_type, + "Volume %s is in state '%s' not 'available'" % (resource_value, test.get("State")), + ) + except ClientError as e: + if ( + e.response.get("Error") + 
.get("Message") + .endswith("parameter volumes is invalid. Expected: 'vol-...'.") + ): + self.__fail(resource_type, "Volume %s does not exist." % resource_value) - # Check spot bid percentage - if "SpotPrice" in resource_value: - if int(resource_value["SpotPrice"]) > 100 or int(resource_value["SpotPrice"]) < 0: - print("ERROR: Spot bid percentage needs to be between 0 and 100") - sys.exit(1) + self.__fail(resource_type, e.response.get("Error").get("Message")) + # EFS file system Id + elif resource_type == "EFSFSId": + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + efs = boto3.client( + "efs", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + self.__check_efs_fs_id(ec2, efs, resource_value) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + # EFS Performance Mode check + elif resource_type == "EFSPerfMode": + if resource_value != "generalPurpose" and resource_value != "maxIO": + self.__fail( + resource_type, + "Invalid value for 'performance_mode'! " + "Acceptable values for 'performance_mode' are generalPurpose and maxIO", + ) + # EFS Throughput check + elif resource_type == "EFSThroughput": + throughput_mode = resource_value[0] + provisioned_throughput = resource_value[1] + if throughput_mode and (throughput_mode != "provisioned" and throughput_mode != "bursting"): + self.__fail( + resource_type, + "Invalid value for 'throughput_mode'! " + "Acceptable values for 'throughput_mode' are bursting and provisioned", + ) + if provisioned_throughput is not None: + if throughput_mode != "provisioned": + self.__fail( + resource_type, + "When specifying 'provisioned_throughput', the 'throughput_mode' must be set to provisioned", + ) + else: + if throughput_mode == "provisioned": + self.__fail( + resource_type, + "When specifying 'throughput_mode' to provisioned, " + "the 'provisioned_throughput' option must be specified", + ) + # RAID EBS IOPS + elif resource_type == "RAIDIOPS": + raid_iops = float(resource_value[0]) + raid_vol_size = float(resource_value[1]) + if raid_iops > raid_vol_size * 50: + self.__fail( + resource_type, + "IOPS to volume size ratio of %s is too high; maximum is 50." % (raid_iops / raid_vol_size), + ) + # RAID Array Type + elif resource_type == "RAIDType": + if resource_value != "0" and resource_value != "1": + self.__fail(resource_type, "Invalid raid_type, only RAID 0 and RAID 1 are currently supported.") + # Number of RAID Volumes Requested + elif resource_type == "RAIDNumVol": + if int(resource_value) > 5 or int(resource_value) < 2: + self.__fail( + resource_type, + "Invalid num_of_raid_volumes. 
" + "Needs min of 2 volumes for RAID and max of 5 EBS volumes are currently supported.", + ) + # FSX FS Id check + elif resource_type in ["fsx_fs_id", "FSx_storage_capacity", "FSx_imported_file_chunk_size"]: + self.__validate_fsx_parameters(resource_type, resource_value) + # Batch Parameters + elif resource_type == "AWSBatch_Parameters": + # Check region + if self.region in [ + "ap-northeast-3", + "eu-north-1", + "cn-north-1", + "cn-northwest-1", + "us-gov-east-1", + "us-gov-west-1", + ]: + self.__fail(resource_type, "Region %s is not supported with batch scheduler" % self.region) + + # Check compute instance types + if "ComputeInstanceType" in resource_value: + try: + s3 = boto3.resource("s3", region_name=self.region) + bucket_name = "%s-aws-parallelcluster" % self.region + file_name = "instances/batch_instances.json" + try: + file_contents = s3.Object(bucket_name, file_name).get()["Body"].read().decode("utf-8") + supported_instances = json.loads(file_contents) + for instance in resource_value["ComputeInstanceType"].split(","): + if not instance.strip() in supported_instances: + self.__fail( + resource_type, "Instance type %s not supported by batch in this region" % instance + ) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) - # Check sanity on desired, min and max vcpus - if "DesiredSize" in resource_value and "MinSize" in resource_value: - if int(resource_value["DesiredSize"]) < int(resource_value["MinSize"]): - print("ERROR: Desired vcpus must be greater than or equal to min vcpus") - sys.exit(1) + # Check spot bid percentage + if "SpotPrice" in resource_value: + if int(resource_value["SpotPrice"]) > 100 or int(resource_value["SpotPrice"]) < 0: + self.__fail(resource_type, "Spot bid percentage needs to be between 0 and 100") - if "DesiredSize" in resource_value and "MaxSize" in resource_value: - if int(resource_value["DesiredSize"]) > int(resource_value["MaxSize"]): - print("ERROR: Desired vcpus must be fewer than or equal to max vcpus") - sys.exit(1) + # Check sanity on desired, min and max vcpus + if "DesiredSize" in resource_value and "MinSize" in resource_value: + if int(resource_value["DesiredSize"]) < int(resource_value["MinSize"]): + self.__fail(resource_type, "Desired vcpus must be greater than or equal to min vcpus") - if "MaxSize" in resource_value and "MinSize" in resource_value: - if int(resource_value["MaxSize"]) < int(resource_value["MinSize"]): - print("ERROR: Max vcpus must be greater than or equal to min vcpus") - sys.exit(1) + if "DesiredSize" in resource_value and "MaxSize" in resource_value: + if int(resource_value["DesiredSize"]) > int(resource_value["MaxSize"]): + self.__fail(resource_type, "Desired vcpus must be fewer than or equal to max vcpus") - # Check custom batch url - if "CustomAWSBatchTemplateURL" in resource_value: - check_resource( - region, aws_access_key_id, aws_secret_access_key, "URL", resource_value["CustomAWSBatchTemplateURL"] - ) + if "MaxSize" in resource_value and "MinSize" in resource_value: + if int(resource_value["MaxSize"]) < int(resource_value["MinSize"]): + self.__fail(resource_type, "Max vcpus must be greater than or equal to min vcpus") + + # Check custom batch url + if "CustomAWSBatchTemplateURL" in resource_value: + self.validate("URL", resource_value["CustomAWSBatchTemplateURL"]) + + @staticmethod + def __fail(resource_type, message): + """ + Print an error and exit. 
+ + :param resource_type: Resource on which the config sanity check failed + :param message: the message to print + """ + print("Config sanity error on resource %s: %s" % (resource_type, message)) + sys.exit(1) diff --git a/cli/pcluster/easyconfig.py b/cli/pcluster/easyconfig.py index ec7cfc82c5..b89ab4f346 100644 --- a/cli/pcluster/easyconfig.py +++ b/cli/pcluster/easyconfig.py @@ -243,7 +243,7 @@ def configure(args): # noqa: C901 FIXME!!! pass for key, value in section.items(): # Only update configuration if not set - if value is not None and key is not "__name__": + if value is not None and key != "__name__": config.set(section["__name__"], key, value) # ensure that the directory for the config file exists (because diff --git a/cli/pcluster/examples/config b/cli/pcluster/examples/config index f139ac10bd..69b8af7761 100644 --- a/cli/pcluster/examples/config +++ b/cli/pcluster/examples/config @@ -91,8 +91,8 @@ key_name = mykey # (defaults to NONE) #placement_group = NONE # Cluster placement logic. This enables the whole cluster or only compute to use the placement group -# (defaults to cluster) -#placement = cluster +# (defaults to compute) +#placement = compute # Path/mountpoint for ephemeral drives # (defaults to /scratch) #ephemeral_dir = /scratch diff --git a/cli/pcluster/pcluster.py b/cli/pcluster/pcluster.py index c554a26856..a1377521b4 100644 --- a/cli/pcluster/pcluster.py +++ b/cli/pcluster/pcluster.py @@ -64,8 +64,8 @@ def create_bucket_with_batch_resources(stack_name, aws_client_config, resources_ def version(args): - config = cfnconfig.ParallelClusterConfig(args) - LOGGER.info(config.version) + pcluster_version = pkg_resources.get_distribution("aws-parallelcluster").version + LOGGER.info(pcluster_version) def create(args): # noqa: C901 FIXME!!! @@ -577,7 +577,59 @@ def instances(args): LOGGER.info("Run 'awsbhosts --cluster %s' to list the compute instances", args.cluster_name) -def command(args, extra_args): +def _get_master_server_ip(stack_name, config): + """ + Get the IP Address of the MasterServer. + + :param stack_name: The name of the cloudformation stack + :param config: Config object + :return private/public ip address + """ + ec2 = boto3.client( + "ec2", + region_name=config.region, + aws_access_key_id=config.aws_access_key_id, + aws_secret_access_key=config.aws_secret_access_key, + ) + + master_id = get_master_server_id(stack_name, config) + if master_id is []: + LOGGER.info("MasterServer not running. Can't SSH") + sys.exit(1) + instance = ec2.describe_instances(InstanceIds=[master_id]).get("Reservations")[0].get("Instances")[0] + ip_address = instance.get("PublicIpAddress") + if ip_address is None: + ip_address = instance.get("PrivateIpAddress") + state = instance.get("State").get("Name") + if state != "running" or ip_address is None: + LOGGER.info("MasterServer: %s\nCannot get ip address.", state.upper()) + sys.exit(1) + return ip_address + + +def _get_output_value(outputs, key_name): + """ + Get output value from Cloudformation Stack Output. + + :param outputs: Cloudformation Stack Outputs + :param key_name: Output Key + :return: OutputValue if that output exists, otherwise None + """ + return next((o.get("OutputValue") for o in outputs if o.get("OutputKey") == key_name), None) + + +def _get_param_value(params, key_name): + """ + Get parameter value from Cloudformation Stack Parameters. 
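Editor's note: both stack-lookup helpers added in `pcluster.py` use the same `next()`-with-default pattern over CloudFormation describe_stacks data. A quick sketch with a hypothetical `Outputs` list (the function body is copied from the patch; only the sample data is invented):

```
# next()-with-default lookup over CloudFormation stack outputs, as used by the ssh command.

def _get_output_value(outputs, key_name):
    return next((o.get("OutputValue") for o in outputs if o.get("OutputKey") == key_name), None)


outputs = [
    {"OutputKey": "ClusterUser", "OutputValue": "ec2-user"},
    {"OutputKey": "MasterPrivateIP", "OutputValue": "10.0.0.12"},
]
print(_get_output_value(outputs, "ClusterUser"))     # ec2-user
print(_get_output_value(outputs, "MasterPublicIP"))  # None -> caller falls back to MasterPrivateIP
```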
+ + :param outputs: Cloudformation Stack Parameters + :param key_name: Parameter Key + :return: ParameterValue if that parameter exists, otherwise None + """ + return next((i.get("ParameterValue") for i in params if i.get("ParameterKey") == key_name), None) + + +def command(args, extra_args): # noqa: C901 FIXME!!! stack = "parallelcluster-" + args.cluster_name config = cfnconfig.ParallelClusterConfig(args) if args.command in config.aliases: @@ -594,13 +646,35 @@ def command(args, extra_args): try: stack_result = cfn.describe_stacks(StackName=stack).get("Stacks")[0] status = stack_result.get("StackStatus") - valid_status = ["CREATE_COMPLETE", "UPDATE_COMPLETE"] - if status not in valid_status: - LOGGER.info("Stack status: %s. Stack needs to be in %s", status, " or ".join(valid_status)) + valid_status = ["CREATE_COMPLETE", "UPDATE_COMPLETE", "UPDATE_ROLLBACK_COMPLETE"] + invalid_status = ["DELETE_COMPLETE", "DELETE_IN_PROGRESS"] + + if status in invalid_status: + LOGGER.info("Stack status: %s. Cannot SSH while in %s", status, " or ".join(invalid_status)) sys.exit(1) - outputs = stack_result.get("Outputs") - username = [o.get("OutputValue") for o in outputs if o.get("OutputKey") == "ClusterUser"][0] - ip = [o.get("OutputValue") for o in outputs if o.get("OutputKey") == "MasterPublicIP"][0] + elif status in valid_status: + outputs = stack_result.get("Outputs") + username = _get_output_value(outputs, "ClusterUser") + ip = ( + _get_output_value(outputs, "MasterPublicIP") + if _get_output_value(outputs, "MasterPublicIP") + else _get_output_value(outputs, "MasterPrivateIP") + ) + + if not username: + LOGGER.info("Failed to get cluster %s username.", args.cluster_name) + sys.exit(1) + + if not ip: + LOGGER.info("Failed to get cluster %s ip.", args.cluster_name) + sys.exit(1) + else: + # Stack is in CREATING, CREATED_FAILED, or ROLLBACK_COMPLETE but MasterServer is running + ip = _get_master_server_ip(stack, config) + template = cfn.get_template(StackName=stack) + mappings = template.get("TemplateBody").get("Mappings").get("OSFeatures") + base_os = _get_param_value(stack_result.get("Parameters"), "BaseOS") + username = mappings.get(base_os).get("User") try: from shlex import quote as cmd_quote @@ -730,7 +804,7 @@ def delete(args): sys.stdout.write("\n") sys.stdout.flush() if status == "DELETE_FAILED": - LOGGER.info("Cluster did not delete successfully. Run 'pcluster delete %s' again", stack) + LOGGER.info("Cluster did not delete successfully. Run 'pcluster delete %s' again", args.cluster_name) except ClientError as e: if e.response.get("Error").get("Message").endswith("does not exist"): if saw_update: diff --git a/cli/pcluster/utils.py b/cli/pcluster/utils.py index 3d1cc9698f..3129ad609b 100644 --- a/cli/pcluster/utils.py +++ b/cli/pcluster/utils.py @@ -10,6 +10,7 @@ # limitations under the License. from __future__ import absolute_import, print_function +import json import os import zipfile from io import BytesIO @@ -111,3 +112,37 @@ def upload_resources_artifacts(bucket_name, root, aws_client_config): bucket.upload_fileobj(zip_dir(os.path.join(root, res)), "%s/artifacts.zip" % res) elif os.path.isfile(os.path.join(root, res)): bucket.upload_file(os.path.join(root, res), res) + + +def get_instances_from_pricing_file(region): + """ + Get pricing file and get supported instances. + + :param region: AWS Region + :return: a json object representing the pricing file content. + :raises ClientError if unable to download the pricing file. 
+ """ + s3 = boto3.resource("s3", region_name=region) + bucket_name = "%s-aws-parallelcluster" % region + file_name = "instances/instances.json" + + file_contents = s3.Object(bucket_name, file_name).get()["Body"].read().decode("utf-8") + return json.loads(file_contents) + + +def get_vcpus_from_pricing_file(region, instance_type): + """ + Read instances json object (fetching it if None) and get number of vcpus for the given instance type. + + :param region: AWS Region + :param instance_type: the instance type to search for. + :return: the number of vcpus or -1 if the instance type cannot be found + :raises ClientError if unable to download the pricing file. + """ + try: + instances = get_instances_from_pricing_file(region) + vcpus = int(instances[instance_type]["vcpus"]) + except KeyError: + vcpus = -1 + + return vcpus diff --git a/cli/requirements.txt b/cli/requirements.txt index 57d3af4c34..39b15b89f6 100644 --- a/cli/requirements.txt +++ b/cli/requirements.txt @@ -1 +1 @@ -boto3>=1.7.33 +boto3>=1.9.48,<=1.9.85 diff --git a/cli/requirements26.txt b/cli/requirements26.txt index 155bd149c6..030faeadd3 100644 --- a/cli/requirements26.txt +++ b/cli/requirements26.txt @@ -1,2 +1,2 @@ -boto3>=1.7.33 -argparse>=1.1 \ No newline at end of file +boto3>=1.9.48,<=1.9.85 +argparse==1.4.0 diff --git a/cli/setup.py b/cli/setup.py index 1c1f14a213..740a669ea1 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -21,16 +21,16 @@ def readme(): return f.read() -VERSION = "2.1.1" -REQUIRES = ["boto3>=1.9.48", "awscli>=1.11.175", "future>=0.16.0", "tabulate>=0.8.2"] +VERSION = "2.2.1" +REQUIRES = ["boto3>=1.9.48,<=1.9.101", "future>=0.16.0,<=0.17.1", "tabulate>=0.8.2,<=0.8.3"] if sys.version_info[:2] == (2, 6): # For python2.6 we have to require argparse since it # was not in stdlib until 2.7. - REQUIRES.append("argparse>=1.4.0") + REQUIRES.append("argparse==1.4.0") if sys.version_info[0] == 2: - REQUIRES.append("configparser>=3.5.0") + REQUIRES.append("configparser>=3.5.0,<=3.5.3") setup( name="aws-parallelcluster", diff --git a/cli/tests/pcluster/test.sh b/cli/tests/pcluster/test.sh index a28c5f4929..b98eac58ad 100755 --- a/cli/tests/pcluster/test.sh +++ b/cli/tests/pcluster/test.sh @@ -1,8 +1,9 @@ #!/bin/bash # Very basic first tests -set -x +set -ex echo $PATH which pcluster +pip check pcluster version pcluster --help diff --git a/cli/tox.ini b/cli/tox.ini index c06eeeb103..0a18dd7af4 100644 --- a/cli/tox.ini +++ b/cli/tox.ini @@ -103,7 +103,6 @@ deps = # flake8-import-order # delegated to isort flake8-colors pep8-naming - flake8-per-file-ignores commands = flake8 \ setup.py \ @@ -111,6 +110,7 @@ commands = pcluster/ \ tests/ \ ../cloudformation/ \ + ../tests/integration-tests/ \ {posargs} # bandit security linter for python: https://github.com/PyCQA/bandit @@ -207,9 +207,14 @@ changedir = ../cloudformation deps = cfn-lint # E2504 disabled since does not allow two-digit numbers in ephemeral(n) -# E2502 disabled since was not working correctly in the substack # W2507 disabled since we want to have nullable String type parameters -commands = cfn-lint -iE2504 -iE2502 -iW2507 *.cfn.json +# E2523 disabled since we have both a Launch Template and Launch Configuration +commands = + cfn-lint -iE2504 -iW2507 -iE2523 aws-parallelcluster.cfn.json + cfn-lint batch-substack.cfn.json + cfn-lint ebs-substack.cfn.json + cfn-lint efs-substack.cfn.json + cfn-lint raid-substack.cfn.json # Validates that cfn json templates are correctly formatted. 
[testenv:cfn-format-check] diff --git a/cloudformation/README.rst b/cloudformation/README.rst index eb59d8d334..d82ae73fe1 100644 --- a/cloudformation/README.rst +++ b/cloudformation/README.rst @@ -1,3 +1,23 @@ -The YAML CloudFormation template can be generated by converting the ``aws-parallelcluster.cfn.json`` file with the *AWS CloudFormation Template Flip* tool. +### Autogenerated Templates -https://github.com/awslabs/aws-cfn-template-flip \ No newline at end of file +The following stack are generated with Troposphere. +* `efs-substack.cfn.json` +* `ebs-substack.cfn.json` +* `fsx-substack.cfn.json` +* `raid-substack.cfn.json` + +To make changes and re-generate do: + +``` +cd cli/ +python ../util/generate-fsx-substack.py --target-path ../cloudformation/fsx-substack.cfn.json +tox -e cfn-format +# check for differences +git diff +``` + +### JSON to YAML + +The YAML CloudFormation template can be generated by converting the ``aws-parallelcluster.cfn.json`` file with the + +*AWS CloudFormation Template Flip* tool. See https://github.com/awslabs/aws-cfn-template-flip \ No newline at end of file diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index aa8fc0d266..906da4078e 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1,6 +1,6 @@ { "AWSTemplateFormatVersion": "2010-09-09", - "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.1.1", + "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.2.1", "Metadata": { "AWS::CloudFormation::Interface": { "ParameterGroups": [ @@ -414,7 +414,7 @@ "Placement": { "Description": "Type of placement requird in AWS ParallelCluster, it can either be cluster or compute.", "Type": "String", - "Default": "cluster", + "Default": "compute", "AllowedValues": [ "cluster", "compute" @@ -589,6 +589,11 @@ "Type": "Number", "Default": "1" }, + "FSXOptions": { + "Description": "Comma separated list of FSx related options, 8 parameters in total, [shared_dir,fsx_fs_id,storage_capacity,fsx_kms_key_id,imported_file_chunk_size,export_path,import_path,weekly_maintenance_start_time]", + "Type": "String", + "Default": "NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE" + }, "EFSOptions": { "Description": "Comma separated list of efs related options, 8 parameters in total, [shared_dir,efs_fs_id,performance_mode,efs_kms_key_id,provisioned_throughput,encrypted,throughput_mode,valid_existing_MTorNot]", "Type": "String", @@ -808,6 +813,28 @@ } ] }, + "CreateFSXSubstack": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "0", + { + "Fn::Split": [ + ",", + { + "Ref": "FSXOptions" + } + ] + } + ] + }, + "NONE" + ] + } + ] + }, "UseSpotInstances": { "Fn::Equals": [ { @@ -932,14 +959,21 @@ ] }, "UseS3ReadPolicy": { - "Fn::Not": [ + "Fn::And": [ { - "Fn::Equals": [ + "Fn::Not": [ { - "Ref": "S3ReadResource" - }, - "NONE" + "Fn::Equals": [ + { + "Ref": "S3ReadResource" + }, + "NONE" + ] + } ] + }, + { + "Condition": "CreateEC2IAMRole" } ] }, @@ -971,14 +1005,21 @@ ] }, "UseS3ReadWritePolicy": { - "Fn::Not": [ + "Fn::And": [ { - "Fn::Equals": [ + "Fn::Not": [ { - "Ref": "S3ReadWriteResource" - }, - "NONE" + "Fn::Equals": [ + { + "Ref": "S3ReadWriteResource" + }, + "NONE" + ] + } ] + }, + { + "Condition": "CreateEC2IAMRole" } ] }, @@ -1171,141 +1212,141 @@ "Mappings": { "AWSRegionOS2AMI": { "ap-northeast-1": { - "alinux": "ami-0ac1d0c35dc3b2a97", - "centos6": "ami-003cfe6266cadd576", - "centos7": "ami-0c0049e4eeb0ef1ac", - 
"ubuntu1404": "ami-053a84278e34b7a59", - "ubuntu1604": "ami-026669cfcef23b3de" + "alinux": "ami-01e7e307e734daa9b", + "centos6": "ami-09c0d7b7eba653962", + "centos7": "ami-0307ad022fdd9bd50", + "ubuntu1404": "ami-0141f9239dd88f7eb", + "ubuntu1604": "ami-00bacec2848062b6b" }, "ap-northeast-2": { - "alinux": "ami-0dddb89f66485f828", - "centos6": "ami-032b3f2a4f1ac91a0", - "centos7": "ami-00f8f46a043a04530", - "ubuntu1404": "ami-03d314c5ac10beb53", - "ubuntu1604": "ami-048214d1413ed8462" + "alinux": "ami-096da9252851971b7", + "centos6": "ami-04bb0577f1425f61f", + "centos7": "ami-01610b2d147974908", + "ubuntu1404": "ami-069d8e1e9b74ee48d", + "ubuntu1604": "ami-0a6edfea96ed6c9db" }, "ap-northeast-3": { - "alinux": "ami-0dbb09bfada65298f", - "centos6": "ami-0c4de7aece7b2db33", - "centos7": "ami-0043e5e12872a00d4", - "ubuntu1404": "ami-070f9069b20a797ec", - "ubuntu1604": "ami-038900edb73cb9496" + "alinux": "ami-0fde1cce0915721ec", + "centos6": "ami-092c70a47ecb4f730", + "centos7": "ami-04dd95194c8990d40", + "ubuntu1404": "ami-07853be3b108fb14c", + "ubuntu1604": "ami-09eeb02948489e793" }, "ap-south-1": { - "alinux": "ami-0ed1bb8f2ab0edffb", - "centos6": "ami-0581583c3d7507d9e", - "centos7": "ami-03dd063b05c3082f1", - "ubuntu1404": "ami-0641d7ca885530589", - "ubuntu1604": "ami-0cc424ec58256ea88" + "alinux": "ami-0856113449f34cfde", + "centos6": "ami-0b6b4a5c5952ba214", + "centos7": "ami-0d848603d9bf30b76", + "ubuntu1404": "ami-02744b942f4094210", + "ubuntu1604": "ami-0dcc47340ec3a0e45" }, "ap-southeast-1": { - "alinux": "ami-058bcd8377aba9bef", - "centos6": "ami-0d00309b80b772532", - "centos7": "ami-00fcdc55bd29f691e", - "ubuntu1404": "ami-0e3de99412375e882", - "ubuntu1604": "ami-059ba95190db36590" + "alinux": "ami-019762c344b80574a", + "centos6": "ami-0d8fee73833faed87", + "centos7": "ami-084b71a47f7c857b7", + "ubuntu1404": "ami-070fef5acc9fd3de2", + "ubuntu1604": "ami-0771cf73e73259040" }, "ap-southeast-2": { - "alinux": "ami-012557ce9426ef1a0", - "centos6": "ami-00ecf7e455945e8bc", - "centos7": "ami-0baa1787fd7a71950", - "ubuntu1404": "ami-09eae4580e8fc835a", - "ubuntu1604": "ami-04df2433ab61d3f37" + "alinux": "ami-09ddff457fb3815ea", + "centos6": "ami-03e0e45d59cf130b3", + "centos7": "ami-0b06de8c266942eff", + "ubuntu1404": "ami-097c066ff342e12fa", + "ubuntu1604": "ami-0252c6d83fa35183c" }, "ca-central-1": { - "alinux": "ami-059dda8ee9af7c20c", - "centos6": "ami-0416d41ed6dbc0bd7", - "centos7": "ami-0c2a8b54dfd0c405f", - "ubuntu1404": "ami-08aeb7a57f73b58ab", - "ubuntu1604": "ami-039a1b0ada060b5ce" + "alinux": "ami-0ef3d34adf231688a", + "centos6": "ami-0f5cabbce86422cc0", + "centos7": "ami-0a2b442fa56fe5db0", + "ubuntu1404": "ami-075be3409c2c2738d", + "ubuntu1604": "ami-090b3ee62a0b5412b" }, "cn-north-1": { - "alinux": "ami-00237da8e056b0936", - "ubuntu1404": "ami-00f2cae5406fb3fce", - "ubuntu1604": "ami-072046713a0458796" + "alinux": "ami-053a5a11a4ac83842", + "ubuntu1404": "ami-0e90250aa31ac4a12", + "ubuntu1604": "ami-0b89a3058ea57b76f" }, "cn-northwest-1": { - "alinux": "ami-0974fc483e449f5ee" + "alinux": "ami-0c5a99564e44467e7" }, "eu-central-1": { - "alinux": "ami-09cff6787920e967c", - "centos6": "ami-0b1f52047bb2b7f83", - "centos7": "ami-00b3f34240b6021dd", - "ubuntu1404": "ami-0b24a435216670b4a", - "ubuntu1604": "ami-0d816068d1164f4d2" + "alinux": "ami-01d7252afc45b0d8b", + "centos6": "ami-0c47523135cb69662", + "centos7": "ami-089c27327ed56f33c", + "ubuntu1404": "ami-093d2529752e5e2f2", + "ubuntu1604": "ami-0c7ccc7ec89bd0d75" }, "eu-north-1": { - "alinux": "ami-086f4f382fb1119f0", - 
"centos6": "ami-04fc976e8108996e6", - "centos7": "ami-07ac74edc9e96b343", - "ubuntu1404": "ami-0921b515f8ed512c3", - "ubuntu1604": "ami-046c32486a9abf742" + "alinux": "ami-02224ba7786413561", + "centos6": "ami-064af23ad3c8bac88", + "centos7": "ami-016fe6e29bff94e38", + "ubuntu1404": "ami-01ea61e8b26299de1", + "ubuntu1604": "ami-0675d81a659cb530e" }, "eu-west-1": { - "alinux": "ami-080a7d5c75253bd1b", - "centos6": "ami-073f1f5db6cfdd3d1", - "centos7": "ami-0879d97613ba8075a", - "ubuntu1404": "ami-076fbdec21cd5c940", - "ubuntu1604": "ami-0f641e63ebaf647b1" + "alinux": "ami-0a32ae196621ce1cd", + "centos6": "ami-048edb0ed06f91ef8", + "centos7": "ami-00e9df6764697605f", + "ubuntu1404": "ami-0d1cb938a6d6b693c", + "ubuntu1604": "ami-010af7d7788dac778" }, "eu-west-2": { - "alinux": "ami-0b284f1028a743865", - "centos6": "ami-0ace56d1d9c1aa466", - "centos7": "ami-06aa0ca6f28c29fdb", - "ubuntu1404": "ami-0fb01b5b56bc27509", - "ubuntu1604": "ami-067c1c0157477c166" + "alinux": "ami-0e8d810df84d61f8b", + "centos6": "ami-098b95acdef72fd6c", + "centos7": "ami-00884b56887c9585a", + "ubuntu1404": "ami-0487ce94890b5869b", + "ubuntu1604": "ami-01290e9a5298ae134" }, "eu-west-3": { - "alinux": "ami-099197e40d0c5de6e", - "centos6": "ami-0d377bccde07c887d", - "centos7": "ami-0b28dcaf2b4d00d87", - "ubuntu1404": "ami-0ea8b79c622d0a4a2", - "ubuntu1604": "ami-0102caf8c6ec0768a" + "alinux": "ami-09a1847f7683c6351", + "centos6": "ami-0e25f1a2ef5f8d237", + "centos7": "ami-0252465d6dff43cfb", + "ubuntu1404": "ami-01a321e226dbf081a", + "ubuntu1604": "ami-0996ed0efb2b8cc4a" }, "sa-east-1": { - "alinux": "ami-02e542e4935ff9647", - "centos6": "ami-0b1c8f6aad337d5b6", - "centos7": "ami-00a570d9536621660", - "ubuntu1404": "ami-00cf3910c959e9fd5", - "ubuntu1604": "ami-0ef5c70aec338bcfb" + "alinux": "ami-03c4f6e67dcea925d", + "centos6": "ami-07bcda88df87d3db2", + "centos7": "ami-02f4e6da1e0de2347", + "ubuntu1404": "ami-02366044e8d7372fa", + "ubuntu1604": "ami-0a3ef670136cc81ee" }, "us-east-1": { - "alinux": "ami-0cd2dd3198972a68c", - "centos6": "ami-0919d912e0e33d247", - "centos7": "ami-09d092c1b054832df", - "ubuntu1404": "ami-095016c5ff0ab7ae6", - "ubuntu1604": "ami-098c8e582ca818cff" + "alinux": "ami-096b5898281e68ea3", + "centos6": "ami-0fd0ae64eb509cc23", + "centos7": "ami-08b43bfe7a0d16eeb", + "ubuntu1404": "ami-00f8fb8ed06007c1f", + "ubuntu1604": "ami-05f79ab77fc13e20a" }, "us-east-2": { - "alinux": "ami-057e70f0fbb007ab6", - "centos6": "ami-0bbd43b2b8991cdae", - "centos7": "ami-0e0cda1cdec7fcecf", - "ubuntu1404": "ami-02a21f90af8a453f0", - "ubuntu1604": "ami-055279b0b09d12a71" + "alinux": "ami-0c8b41c511db3c17c", + "centos6": "ami-09c42f6a076482eeb", + "centos7": "ami-0c71e12e95cc937ef", + "ubuntu1404": "ami-0e23f662335e169a6", + "ubuntu1604": "ami-08a675767416c627a" }, "us-gov-east-1": { - "alinux": "ami-022f96b137a63c9ff", - "ubuntu1404": "ami-09274e2587c4154f2", - "ubuntu1604": "ami-0b664ce8c427b77eb" + "alinux": "ami-08cff3f9ef830bdfb", + "ubuntu1404": "ami-0a34c5de0403642d2", + "ubuntu1604": "ami-08b15aaa28124fac1" }, "us-gov-west-1": { - "alinux": "ami-fefb989f", - "ubuntu1404": "ami-6cf0930d", - "ubuntu1604": "ami-5ef6953f" + "alinux": "ami-afa0c8ce", + "ubuntu1404": "ami-e4a3cb85", + "ubuntu1604": "ami-eba3cb8a" }, "us-west-1": { - "alinux": "ami-09686d4090e35a702", - "centos6": "ami-0ac1cf1e68288fa36", - "centos7": "ami-084c065e503e74449", - "ubuntu1404": "ami-099ebeb456dbf0646", - "ubuntu1604": "ami-0db5e85c0b1ce2c20" + "alinux": "ami-08e9806b160f9aa44", + "centos6": "ami-060bcf74fb6278986", + "centos7": 
"ami-0f925f166f349eaa4", + "ubuntu1404": "ami-021201edbaab31f29", + "ubuntu1604": "ami-059b8cd1b0041071a" }, "us-west-2": { - "alinux": "ami-0c588cdc9e91b0db3", - "centos6": "ami-08b28682da5721f5b", - "centos7": "ami-070345cb145f2d2f0", - "ubuntu1404": "ami-05ddc7ec0edb9069f", - "ubuntu1604": "ami-02393fa61ac61547a" + "alinux": "ami-04eba5b9de0d94ec6", + "centos6": "ami-009c08111f57a2d4c", + "centos7": "ami-05f6a646767bf0c87", + "ubuntu1404": "ami-085bb9cfafd2e3b3a", + "ubuntu1604": "ami-034bffb8da06d6951" } }, "OSFeatures": { @@ -1343,8 +1384,8 @@ }, "PackagesVersions": { "default": { - "parallelcluster": "2.1.1", - "cookbook": "aws-parallelcluster-cookbook-2.1.1", + "parallelcluster": "2.2.1", + "cookbook": "aws-parallelcluster-cookbook-2.2.1", "chef": "14.2.0", "ridley": "5.1.1", "berkshelf": "7.0.4", @@ -1407,6 +1448,54 @@ "MessageRetentionPeriod": 1209600 } }, + "FSXSubstack": { + "Type": "AWS::CloudFormation::Stack", + "Properties": { + "Parameters": { + "FSXOptions": { + "Ref": "FSXOptions" + }, + "ComputeSecurityGroup": { + "Fn::If": [ + "CreateSecurityGroups", + { + "Ref": "ComputeSecurityGroup" + }, + { + "Ref": "VPCSecurityGroupId" + } + ] + }, + "SubnetId": { + "Ref": "MasterSubnetId" + } + }, + "TemplateURL": { + "Fn::Sub": [ + "https://${s3_domain}/${AWS::Region}-aws-parallelcluster/templates/fsx-substack-${version}.cfn.json", + { + "s3_domain": { + "Fn::If": [ + "GovCloudRegion", + { + "Fn::Sub": "s3-${AWS::Region}.amazonaws.com" + }, + "s3.amazonaws.com" + ] + }, + "version": { + "Fn::FindInMap": [ + "PackagesVersions", + "default", + "parallelcluster" + ] + } + } + ] + } + }, + "Condition": "CreateFSXSubstack" + }, "SQSPolicy": { "Type": "AWS::SQS::QueuePolicy", "Properties": { @@ -2265,8 +2354,7 @@ "chefPrepEnv", "shellRunPreInstall", "chefConfig", - "shellRunPostInstall", - "shellForkClusterReadyInstall" + "shellRunPostInstall" ] }, "deployConfigFiles": { @@ -2330,6 +2418,21 @@ "cfn_efs_shared_dir": { "Ref": "EFSOptions" }, + "cfn_fsx_fs_id": { + "Fn::If": [ + "CreateFSXSubstack", + { + "Fn::GetAtt": [ + "FSXSubstack", + "Outputs.FileSystemId" + ] + }, + "" + ] + }, + "cfn_fsx_options": { + "Ref": "FSXOptions" + }, "cfn_volume": { "Fn::GetAtt": [ "EBSCfnStack", @@ -2436,10 +2539,13 @@ "getCookbooks": { "commands": { "berk": { - "command": "if [ ! -f /opt/parallelcluster/.bootstrapped -o \"$(cat /opt/parallelcluster/.bootstrapped)\" != \"$parallelcluster_version\" ]; then . /tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done; fi", + "command": "if [ ! -f /opt/parallelcluster/.bootstrapped -o \"$(cat /opt/parallelcluster/.bootstrapped)\" != \"$parallelcluster_version\" -o \"$custom_cookbook\" != \"NONE\" ]; then . 
/tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done; fi", "cwd": "/tmp/cookbooks", "env": { "HOME": "/tmp", + "custom_cookbook": { + "Ref": "CustomChefCookbook" + }, "parallelcluster_version": { "Fn::Join": [ "", @@ -2488,13 +2594,6 @@ "command": "/opt/parallelcluster/scripts/fetch_and_run -postinstall" } } - }, - "shellForkClusterReadyInstall": { - "commands": { - "clusterreadyinstall": { - "command": "/opt/parallelcluster/scripts/fetch_and_run -clusterreadyinstall" - } - } } } }, @@ -3096,7 +3195,6 @@ "shellRunPreInstall", "chefConfig", "shellRunPostInstall", - "shellForkClusterReadyInstall", "signalComputeReady" ] }, @@ -3144,6 +3242,21 @@ "cfn_efs_shared_dir": { "Ref": "EFSOptions" }, + "cfn_fsx_fs_id": { + "Fn::If": [ + "CreateFSXSubstack", + { + "Fn::GetAtt": [ + "FSXSubstack", + "Outputs.FileSystemId" + ] + }, + "" + ] + }, + "cfn_fsx_options": { + "Ref": "FSXOptions" + }, "cfn_scheduler": { "Ref": "Scheduler" }, @@ -3241,10 +3354,13 @@ "getCookbooks": { "commands": { "berk": { - "command": "if [ ! -f /opt/parallelcluster/.bootstrapped -o \"$(cat /opt/parallelcluster/.bootstrapped)\" != \"$parallelcluster_version\" ]; then . /tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done; fi", + "command": "if [ ! -f /opt/parallelcluster/.bootstrapped -o \"$(cat /opt/parallelcluster/.bootstrapped)\" != \"$parallelcluster_version\" -o \"$custom_cookbook\" != \"NONE\" ]; then . /tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done; fi", "cwd": "/tmp/cookbooks", "env": { "HOME": "/tmp", + "custom_cookbook": { + "Ref": "CustomChefCookbook" + }, "parallelcluster_version": { "Fn::Join": [ "", @@ -3294,13 +3410,6 @@ } } }, - "shellForkClusterReadyInstall": { - "commands": { - "clusterreadyinstall": { - "command": "/opt/parallelcluster/scripts/fetch_and_run -clusterreadyinstall" - } - } - }, "signalComputeReady": { "commands": { "compute_ready": { @@ -3769,7 +3878,6 @@ "shellRunPreInstall", "chefConfig", "shellRunPostInstall", - "shellForkClusterReadyInstall", "signalComputeReady" ] }, @@ -3817,6 +3925,21 @@ "cfn_efs_shared_dir": { "Ref": "EFSOptions" }, + "cfn_fsx_fs_id": { + "Fn::If": [ + "CreateFSXSubstack", + { + "Fn::GetAtt": [ + "FSXSubstack", + "Outputs.FileSystemId" + ] + }, + "" + ] + }, + "cfn_fsx_options": { + "Ref": "FSXOptions" + }, "cfn_scheduler": { "Ref": "Scheduler" }, @@ -3914,10 +4037,13 @@ "getCookbooks": { "commands": { "berk": { - "command": "if [ ! -f /opt/parallelcluster/.bootstrapped -o \"$(cat /opt/parallelcluster/.bootstrapped)\" != \"$parallelcluster_version\" ]; then . /tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done; fi", + "command": "if [ ! -f /opt/parallelcluster/.bootstrapped -o \"$(cat /opt/parallelcluster/.bootstrapped)\" != \"$parallelcluster_version\" -o \"$custom_cookbook\" != \"NONE\" ]; then . 
/tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done; fi", "cwd": "/tmp/cookbooks", "env": { "HOME": "/tmp", + "custom_cookbook": { + "Ref": "CustomChefCookbook" + }, "parallelcluster_version": { "Fn::Join": [ "", @@ -3967,13 +4093,6 @@ } } }, - "shellForkClusterReadyInstall": { - "commands": { - "clusterreadyinstall": { - "command": "/opt/parallelcluster/scripts/fetch_and_run -clusterreadyinstall" - } - } - }, "signalComputeReady": { "commands": { "compute_ready": { diff --git a/cloudformation/batch-substack.cfn.json b/cloudformation/batch-substack.cfn.json index 43de764753..694d1dd74d 100644 --- a/cloudformation/batch-substack.cfn.json +++ b/cloudformation/batch-substack.cfn.json @@ -375,7 +375,10 @@ "Properties": { "Type": "MANAGED", "ServiceRole": { - "Ref": "BatchServiceRole" + "Fn::GetAtt": [ + "BatchServiceRole", + "Arn" + ] }, "ComputeResources": { "Type": { @@ -421,7 +424,10 @@ "Fn::If": [ "UseSpot", { - "Ref": "SpotIamFleetRole" + "Fn::GetAtt": [ + "SpotIamFleetRole", + "Arn" + ] }, { "Ref": "AWS::NoValue" @@ -627,7 +633,10 @@ "Fn::Sub": "${ClusterName}-build-docker-images-project" }, "ServiceRole": { - "Ref": "CodeBuildRole" + "Fn::GetAtt": [ + "CodeBuildRole", + "Arn" + ] }, "Source": { "Location": { diff --git a/cloudformation/fsx-substack.cfn.json b/cloudformation/fsx-substack.cfn.json new file mode 100644 index 0000000000..a1a549d08c --- /dev/null +++ b/cloudformation/fsx-substack.cfn.json @@ -0,0 +1,292 @@ +{ + "Conditions": { + "CreateFSX": { + "Fn::And": [ + { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "0", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + }, + { + "Fn::Equals": [ + { + "Fn::Select": [ + "1", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + }, + "UseExportPath": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "5", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + }, + "UseFSXKMSKey": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + }, + "UseImportPath": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "6", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + }, + "UseImportedFileChunkSize": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "4", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + }, + "UseStorageCap": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "2", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + }, + "UseWeeklyMaintenanceStartTime": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "7", + { + "Ref": "FSXOptions" + } + ] + }, + "NONE" + ] + } + ] + } + }, + "Outputs": { + "FileSystemId": { + "Description": "ID of the FileSystem", + "Value": { + "Fn::If": [ + "CreateFSX", + { + "Ref": "FileSystem" + }, + { + "Fn::Select": [ + "1", + { + "Ref": "FSXOptions" + } + ] + } + ] + } + } + }, + "Parameters": { + "ComputeSecurityGroup": { + "Description": "SecurityGroup for FSx filesystem", + "Type": "String" + }, + "FSXOptions": { + "Description": "Comma separated list of fsx related options, 8 parameters in total, [shared_dir,fsx_fs_id,storage_capacity,fsx_kms_key_id,imported_file_chunk_size,export_path,import_path,weekly_maintenance_start_time]", + "Type": "CommaDelimitedList" + }, + "SubnetId": { + "Description": "SubnetId for FSx filesystem", + "Type": "String" + } + }, + "Resources": { + "FileSystem": { 
+ "Condition": "CreateFSX", + "Properties": { + "FileSystemType": "LUSTRE", + "KmsKeyId": { + "Fn::If": [ + "UseFSXKMSKey", + { + "Fn::Select": [ + "3", + { + "Ref": "FSXOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + "LustreConfiguration": { + "ExportPath": { + "Fn::If": [ + "UseExportPath", + { + "Fn::Select": [ + "5", + { + "Ref": "FSXOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + "ImportPath": { + "Fn::If": [ + "UseImportPath", + { + "Fn::Select": [ + "6", + { + "Ref": "FSXOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + "ImportedFileChunkSize": { + "Fn::If": [ + "UseImportedFileChunkSize", + { + "Fn::Select": [ + "4", + { + "Ref": "FSXOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + "WeeklyMaintenanceStartTime": { + "Fn::If": [ + "UseWeeklyMaintenanceStartTime", + { + "Fn::Select": [ + "7", + { + "Ref": "FSXOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + } + }, + "SecurityGroupIds": [ + { + "Ref": "ComputeSecurityGroup" + } + ], + "StorageCapacity": { + "Fn::If": [ + "UseStorageCap", + { + "Fn::Select": [ + "2", + { + "Ref": "FSXOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + "SubnetIds": [ + { + "Ref": "SubnetId" + } + ] + }, + "Type": "AWS::FSx::FileSystem" + } + } +} diff --git a/cloudformation/utils/json_formatter.py b/cloudformation/utils/json_formatter.py index 9b925baf26..927a6e0479 100644 --- a/cloudformation/utils/json_formatter.py +++ b/cloudformation/utils/json_formatter.py @@ -1,8 +1,9 @@ -import argparse import json from collections import OrderedDict from glob import glob +import argparse + def _parse_args(): parser = argparse.ArgumentParser(description="Formats a json document.") diff --git a/docs/conf.py b/docs/conf.py index 31863370e7..24b346279a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,9 +51,9 @@ # built documents. # # The short X.Y version. -version = '2.1' +version = '2.2' # The full version, including alpha/beta/rc tags. -release = '2.1.1' +release = '2.2.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/configuration.rst b/docs/configuration.rst index c047852b2f..b4f63c7299 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2,14 +2,19 @@ Configuration ============= .. toctree:: -pcluster uses the file ``~/.parallelcluster/config`` by default for all configuration parameters. +ParallelCluster uses the file ``~/.parallelcluster/config`` by default for all configuration parameters. + +An example configuration file can be found at ``site-packages/aws-parallelcluster/examples/config``. -You can see an example configuration file ``site-packages/aws-parallelcluster/examples/config`` Layout ------ -Configuration is defined in multiple sections. Required sections are "global", "aws", one "cluster", and one "subnet". +Configuration is defined in multiple sections. + +Required sections are "global" and "aws". + +At least one "cluster" and one "subnet" section must be included. A section starts with the section name in brackets, followed by parameters and configuration. :: @@ -30,7 +35,7 @@ Global configuration options related to pcluster. :: cluster_template """""""""""""""" -The name of the cluster section used for the cluster. +Defines the name of the cluster section used for the cluster. See the :ref:`Cluster Definition `. :: @@ -38,23 +43,24 @@ See the :ref:`Cluster Definition `. :: update_check """""""""""" -Whether or not to check for updates to pcluster. 
:: +Check for updates to pcluster. :: update_check = true sanity_check """""""""""" -Attempts to validate that resources defined in parameters actually exist. :: +Attempt to validate the existence of the resources defined in parameters. :: sanity_check = true aws ^^^ -This is the AWS credentials/region section (required). These settings apply to all clusters. +AWS credentials/region section. + +These settings apply to all clusters and are REQUIRED. -We highly recommend use of the environment, EC2 IAM Roles, or storing credentials using the `AWS CLI -`_ to store credentials, rather than -storing them in the AWS ParallelCluster config file. :: +For security purposes, AWS highly recommends using the environment, EC2 IAM Roles, or the +`AWS CLI `_ to store credentials rather than saving into the AWS ParallelCluster config file. :: [aws] aws_access_key_id = #your_aws_access_key_id @@ -63,12 +69,13 @@ storing them in the AWS ParallelCluster config file. :: # Defaults to us-east-1 if not defined in environment or below aws_region_name = #region - aliases ^^^^^^^ -This is the aliases section. Use this section to customize the `ssh` command. +Aliases section. + +Customize the `ssh` command here. -`CFN_USER` is set to the default username for the os. +`CFN_USER` is set to the default username for the OS. `MASTER_IP` is set to the IP address of the master instance. `ARGS` is set to whatever arguments the user provides after `pcluster ssh cluster_name`. :: @@ -81,9 +88,9 @@ This is the aliases section. Use this section to customize the `ssh` command. cluster ^^^^^^^ -You can define one or more clusters for different types of jobs or workloads. +Defines one or more clusters for different job types or workloads. -Each cluster has it's own configuration based on your needs. +Each cluster can have its own individual configuration. The format is [cluster ]. :: @@ -97,7 +104,7 @@ Name of an existing EC2 KeyPair to enable SSH access to the instances. :: template_url """""""""""" -Overrides the path to the CloudFormation template used to create the cluster +Defines the path to the CloudFormation template used to create the cluster. Defaults to ``https://s3.amazonaws.com/-aws-parallelcluster/templates/aws-parallelcluster-.cfn.json``. :: @@ -106,10 +113,10 @@ Defaults to compute_instance_type """"""""""""""""""""" -The EC2 instance type used for the cluster compute nodes. +Defines the EC2 instance type used for the cluster compute nodes. -If you're using awsbatch, please refer to the Compute Environments creation in the AWS Batch UI for the list of the -supported instance types. +If the scheduler is awsbatch, please refer to the Compute Environments creation in the +AWS Batch UI for the list of supported instance types. Defaults to t2.micro, ``optimal`` when scheduler is awsbatch :: @@ -117,9 +124,9 @@ Defaults to t2.micro, ``optimal`` when scheduler is awsbatch :: master_instance_type """""""""""""""""""" -The EC2 instance type use for the master node. +Defines the EC2 instance type used for the master node. -This defaults to t2.micro. :: +Defaults to t2.micro. :: master_instance_type = t2.micro @@ -127,11 +134,13 @@ This defaults to t2.micro. :: initial_queue_size """""""""""""""""" -The initial number of EC2 instances to launch as compute nodes in the cluster for traditional schedulers. +Set the initial number of EC2 instances to launch as compute nodes in the cluster. -If you're using awsbatch, use :ref:`min_vcpus `. +This setting is applicable only for traditional schedulers (sge, slurm, and torque). 
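As an aside, these sizing options can be read back with plain ``configparser``; the sketch below (Python 3) is illustrative only and is not how pcluster itself parses the file. It assumes ``~/.parallelcluster/config`` exists with a ``[global]`` section, and the ``default`` fallback for the cluster template name is an assumption::

    import configparser
    import os

    config = configparser.ConfigParser()
    config.read(os.path.expanduser("~/.parallelcluster/config"))

    # [global] points at the active cluster section, e.g. [cluster default].
    cluster_name = config.get("global", "cluster_template", fallback="default")
    section = "cluster %s" % cluster_name
    initial = config.getint(section, "initial_queue_size", fallback=2)
    maximum = config.getint(section, "max_queue_size", fallback=10)
    print("%s: initial_queue_size=%d, max_queue_size=%d" % (section, initial, maximum))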
-The default is 2. :: +If the scheduler is awsbatch, use :ref:`min_vcpus `. + +Defaults to 2. :: initial_queue_size = 2 @@ -139,25 +148,27 @@ The default is 2. :: max_queue_size """""""""""""" -The maximum number of EC2 instances that can be launched in the cluster for traditional schedulers. +Set the maximum number of EC2 instances that can be launched in the cluster. -If you're using awsbatch, use :ref:`max_vcpus `. +This setting is applicable only for traditional schedulers (sge, slurm, and torque). -This defaults to 10. :: +If the scheduler is awsbatch, use :ref:`max_vcpus `. + +Defaults to 10. :: max_queue_size = 10 maintain_initial_size """"""""""""""""""""" -Boolean flag to set autoscaling group to maintain initial size for traditional schedulers. +Boolean flag to maintain initial size of the Auto Scaling group for traditional schedulers. -If you're using awsbatch, use :ref:`desired_vcpus `. +If the scheduler is awsbatch, use :ref:`desired_vcpus `. -If set to true, the Auto Scaling group will never have fewer members than the value of initial_queue_size. It will -still allow the cluster to scale up to the value of max_queue_size. +If set to true, the Auto Scaling group will never have fewer members than the value +of initial_queue_size. The cluster can still scale up to the value of max_queue_size. -Setting to false allows the Auto Scaling group to scale down to 0 members, so resources will not sit idle when they -aren't needed. +If set to false, the Auto Scaling group can scale down to 0 members to prevent resources +from sitting idle when they are not needed. Defaults to false. :: @@ -167,7 +178,7 @@ Defaults to false. :: min_vcpus """"""""" -If scheduler is awsbatch, the compute environment won't have fewer than min_vcpus. +If the scheduler is awsbatch, the compute environment will never have fewer than min_vcpus. Defaults to 0. :: @@ -177,7 +188,7 @@ Defaults to 0. :: desired_vcpus """"""""""""" -If scheduler is awsbatch, the compute environment will initially have desired_vcpus +If the scheduler is awsbatch, the compute environment will initially have desired_vcpus. Defaults to 4. :: @@ -187,17 +198,19 @@ Defaults to 4. :: max_vcpus """"""""" -If scheduler is awsbatch, the compute environment will at most have max_vcpus. +If the scheduler is awsbatch, the compute environment will at most have max_vcpus. Defaults to 20. :: - desired_vcpus = 20 + max_vcpus = 20 scheduler """"""""" -Scheduler to be used with the cluster. Valid options are sge, torque, slurm, or awsbatch. +Defines the cluster scheduler. -If you're using awsbatch, please take a look at the :ref:`networking setup `. +Valid options are sge, torque, slurm, or awsbatch. + +If the scheduler is awsbatch, please take a look at the :ref:`networking setup `. Defaults to sge. :: @@ -205,7 +218,9 @@ Defaults to sge. :: cluster_type """""""""""" -Type of cluster to launch i.e. ondemand or spot +Defines the type of cluster to launch. + +Valid options are ondemand or spot. Defaults to ondemand. :: @@ -213,13 +228,13 @@ Defaults to ondemand. :: spot_price """""""""" -If cluster_type is set to spot, you can optionally set the maximum spot price for the ComputeFleet on traditional -schedulers. If you do not specify a value, you are charged the Spot price, capped at the On-Demand price. +If cluster_type is set to spot, you can optionally set the maximum spot price for the +ComputeFleet on traditional schedulers. If you do not specify a value, you are charged the +Spot price, capped at the On-Demand price. 
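Recent Spot prices can help when choosing a ``spot_price`` ceiling. The boto3 sketch below is illustrative and not part of pcluster; the region and instance type are placeholders::

    import boto3

    ec2 = boto3.client("ec2", region_name="us-east-1")
    # Most recent Spot prices for the compute instance type, per availability zone.
    history = ec2.describe_spot_price_history(
        InstanceTypes=["c5.xlarge"],
        ProductDescriptions=["Linux/UNIX"],
        MaxResults=5,
    )
    for entry in history["SpotPriceHistory"]:
        print(entry["AvailabilityZone"], entry["SpotPrice"])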
-If you're using awsbatch, use :ref:`spot_bid_percentage `. +If the scheduler is awsbatch, use :ref:`spot_bid_percentage `. -See the `Spot Bid Advisor `_ for assistance finding a bid price that -meets your needs:: +See the `Spot Bid Advisor `_ for assistance finding a bid price that meets your needs. :: spot_price = 1.50 @@ -227,26 +242,26 @@ meets your needs:: spot_bid_percentage """"""""""""""""""" -If you're using awsbatch as your scheduler, this optional parameter is the on-demand bid percentage. If not specified -you'll get the current spot market price, capped at the on-demand price. :: +If awsbatch is the scheduler, this optional parameter is the on-demand bid percentage. - spot_price = 85 +If unspecified, the current spot market price will be selected, capped at the on-demand price. :: + + spot_bid_percentage = 85 .. _custom_ami_section: custom_ami """""""""" -ID of a Custom AMI, to use instead of default `published AMI's -`_. :: +ID of a Custom AMI to use instead of the default `published AMIs `_. :: custom_ami = NONE s3_read_resource """""""""""""""" -Specify S3 resource for which AWS ParallelCluster nodes will be granted read-only access +Specify an S3 resource to which AWS ParallelCluster nodes will be granted read-only access. -For example, 'arn:aws:s3:::my_corporate_bucket/\*' would provide read-only access to all objects in the -my_corporate_bucket bucket. +For example, 'arn:aws:s3:::my_corporate_bucket/\*' would provide read-only access to all +objects in the my_corporate_bucket bucket. See :doc:`working with S3 ` for details on format. @@ -256,10 +271,10 @@ Defaults to NONE. :: s3_read_write_resource """""""""""""""""""""" -Specify S3 resource for which AWS ParallelCluster nodes will be granted read-write access +Specify an S3 resource to which AWS ParallelCluster nodes will be granted read-write access. -For example, 'arn:aws:s3:::my_corporate_bucket/Development/\*' would provide read-write access to all objects in the -Development folder of the my_corporate_bucket bucket. +For example, 'arn:aws:s3:::my_corporate_bucket/Development/\*' would provide read-write +access to all objects in the Development folder of the my_corporate_bucket bucket. See :doc:`working with S3 ` for details on format. @@ -269,11 +284,11 @@ Defaults to NONE. :: pre_install """"""""""" -URL to a preinstall script. This is executed before any of the boot_as_* scripts are run +URL to a preinstall script that is executed before any of the boot_as_* scripts are run. -This only gets executed on the master node when using awsbatch as your scheduler. +When using awsbatch as the scheduler, the preinstall script is only executed on the master node. -Can be specified in "http://hostname/path/to/script.sh" or "s3://bucketname/path/to/script.sh" format. +The parameter format can be specified as "http://hostname/path/to/script.sh" or "s3://bucketname/path/to/script.sh". Defaults to NONE. :: @@ -281,7 +296,7 @@ Defaults to NONE. :: pre_install_args """""""""""""""" -Quoted list of arguments to be passed to preinstall script +Quoted list of arguments to be passed to the preinstall script. Defaults to NONE. :: @@ -289,9 +304,9 @@ Defaults to NONE. :: post_install """""""""""" -URL to a postinstall script. This is executed after any of the boot_as_* scripts are run +URL to a postinstall script that is executed after all of the boot_as_* scripts are run. -This only gets executed on the master node when using awsbatch as your scheduler. 
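A common way to host pre/post-install scripts is a private S3 bucket. The boto3 sketch below uploads a local script and prints the ``s3://`` URL to reference from the config; the bucket, key and file names are examples, and the instances still need read access to that object (for instance via ``s3_read_resource`` or a custom ``ec2_iam_role``)::

    import boto3

    bucket = "my-cluster-scripts"      # assumed to already exist
    key = "bootstrap/preinstall.sh"

    s3 = boto3.client("s3")
    s3.upload_file("preinstall.sh", bucket, key)  # local file -> S3 object
    print("pre_install = s3://%s/%s" % (bucket, key))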
+When using awsbatch as the scheduler, the postinstall script is only executed on the master node. Can be specified in "http://hostname/path/to/script.sh" or "s3://bucketname/path/to/script.sh" format. @@ -301,7 +316,7 @@ Defaults to NONE. :: post_install_args """"""""""""""""" -Arguments to be passed to postinstall script +Arguments to be passed to the postinstall script. Defaults to NONE. :: @@ -309,7 +324,7 @@ Defaults to NONE. :: proxy_server """""""""""" -HTTP(S) proxy server, typically http://x.x.x.x:8080 +Defines an HTTP(S) proxy server, typically http://x.x.x.x:8080. Defaults to NONE. :: @@ -317,31 +332,38 @@ Defaults to NONE. :: placement_group """"""""""""""" -Cluster placement group. The can be one of three values: NONE, DYNAMIC and an existing placement group name. When -DYNAMIC is set, a unique placement group will be created as part of the cluster and deleted when the cluster is deleted. +Defines the cluster placement group. -This does not apply to awsbatch. +Valid options are NONE, DYNAMIC or an existing EC2 placement group name. -Defaults to NONE. More information on placement groups can be found `here -`_:: +When DYNAMIC is set, a unique placement group will be created and deleted as part +of the cluster stack. + +This parameter does not apply to awsbatch. + +More information on placement groups can be found `here `_ + +Defaults to NONE. :: placement_group = NONE placement """"""""" -Cluster placement logic. This enables the whole cluster or only compute to use the placement group. +Defines the cluster placement group logic. -Can be ``cluster`` or ``compute``. +This enables the whole cluster or only the compute instances to use the placement group. -This does not apply to awsbatch. +Valid options are ``cluster`` or ``compute``. + +This parameter does not apply to awsbatch. -Defaults to ``cluster``. :: +Defaults to ``compute``. :: - placement = cluster + placement = compute ephemeral_dir """"""""""""" -If instance store volumes exist, this is the path/mountpoint they will be mounted on. +If instance store volumes exist, define the path where they will be mounted. Defaults to /scratch. :: @@ -349,18 +371,24 @@ Defaults to /scratch. :: shared_dir """""""""" -Path/mountpoint for shared EBS volume. Do not use this option when using multiple EBS volumes; provide shared_dir under -each EBS section instead +Defines the path where the shared EBS volume will be mounted. + +Do not use this option with multiple EBS volumes. Provide shared_dir under each EBS section instead. + +See :ref:`EBS Section ` for details on working with multiple EBS volumes. -Defaults to /shared. The example below mounts to /myshared. See :ref:`EBS Section ` for details on working -with multiple EBS volumes:: +Defaults to /shared. + +The example below mounts the shared EBS volume at /myshared. :: shared_dir = myshared encrypted_ephemeral """"""""""""""""""" -Encrypted ephemeral drives. In-memory keys, non-recoverable. If true, AWS ParallelCluster will generate an ephemeral -encryption key in memory and using LUKS encryption, encrypt your instance store volumes. +Encrypt the ephemeral instance store volumes with non-recoverable in-memory keys +using LUKS (Linux Unified Key Setup). + +Please visit https://guardianproject.info/code/luks/ for more information. Defaults to false. :: @@ -368,7 +396,7 @@ Defaults to false. :: master_root_volume_size """"""""""""""""""""""" -MasterServer root volume size in GB. (AMI must support growroot) +MasterServer root volume size in GB. The AMI must support growroot. 
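For reference, the EC2 resource behind the ``placement_group`` option described above can be created and removed directly with boto3. This is only a sketch of what the ``DYNAMIC`` setting automates as part of the cluster stack, not pcluster's implementation; the group name is a placeholder::

    import boto3

    ec2 = boto3.client("ec2", region_name="us-east-1")
    # A "cluster" strategy group keeps instances close together for low latency.
    ec2.create_placement_group(GroupName="my-cluster-pg", Strategy="cluster")
    # ... launch instances into the group, and later clean it up:
    ec2.delete_placement_group(GroupName="my-cluster-pg")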
Defaults to 15. :: @@ -376,7 +404,7 @@ Defaults to 15. :: compute_root_volume_size """""""""""""""""""""""" -ComputeFleet root volume size in GB. (AMI must support growroot) +ComputeFleet root volume size in GB. The AMI must support growroot. Defaults to 15. :: @@ -384,36 +412,38 @@ Defaults to 15. :: base_os """"""" -OS type used in the cluster +OS type used in the cluster. -Defaults to alinux. Available options are: alinux, centos6, centos7, ubuntu1404 and ubuntu1604 +Available options are: alinux, centos6, centos7, ubuntu1404 and ubuntu1604. -Note: The base_os determines the username used to log into the cluster. +Supported operating systems by region are listed in the table below. Please note +that commercial entails all supported regions including us-east-1, us-west-2, etc.:: -Supported OS's by region. Note that commercial is all supported regions such as us-east-1, us-west-2 etc. :: + ============== ====== ============ ============ ============= ============ + region alinux centos6 centos7 ubuntu1404 ubuntu1604 + ============== ====== ============ ============ ============= ============ + commercial True True True True True + us-gov-west-1 True False False True True + us-gov-east-1 True False False True True + cn-north-1 True False False True True + cn-northwest-1 True False False False False + ============== ====== ============ ============ ============= ============ - ============== ====== ============ ============ ============= ============ - region alinux centos6 centos7 ubuntu1404 ubuntu1604 - ============== ====== ============ ============ ============= ============ - commercial True True True True True - us-gov-west-1 True False False True True - us-gov-east-1 True False False True True - cn-north-1 True False False True True - cn-northwest-1 True False False False False - ============== ====== ============ ============ ============= ============ +Note: The base_os determines the username used to log into the cluster. * CentOS 6 & 7: ``centos`` -* Ubuntu: ``ubuntu`` -* Amazon Linux: ``ec2-user`` :: +* Ubuntu 14.04 LTS & 16.04 LTS: ``ubuntu`` +* Amazon Linux: ``ec2-user`` + +Defaults to alinux. :: base_os = alinux ec2_iam_role """""""""""" -The given name of an existing EC2 IAM Role that will be attached to all -instances in the cluster. Note that the given name of a role and its Amazon -Resource Name (ARN) are different, and the latter can not be used as an argument -to ec2_iam_role. +Defines the name of an existing EC2 IAM Role that will be attached to all instances in +the cluster. Note that the given name of a role and its Amazon Resource Name (ARN) are +different, and the latter may not be used as an argument to ec2_iam_role. Defaults to NONE. :: @@ -429,19 +459,19 @@ Defaults to {}. :: additional_cfn_template """"""""""""""""""""""" -An additional CloudFormation template to launch along with the cluster. This allows you to create resources that exist -outside of the cluster but are part of the cluster's life cycle. +Defines an additional CloudFormation template to launch along with the cluster. This +allows for the creation of resources that exist outside of the cluster but are part +of the cluster's life cycle. -Must be a HTTP URL to a public template with all parameters provided. +This value must be a HTTP URL to a public template with all parameters provided. Defaults to NONE. :: additional_cfn_template = NONE - vpc_settings """""""""""" -Settings section relating to VPC to be used +Settings section for the VPC where the cluster will be deployed. 
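The default login user listed above for each ``base_os`` is what ``pcluster ssh`` relies on. The mapping below simply mirrors the documentation (and the template's ``OSFeatures`` map) for illustration::

    # Default SSH usernames per base_os, as documented above.
    DEFAULT_USERNAMES = {
        "alinux": "ec2-user",
        "centos6": "centos",
        "centos7": "centos",
        "ubuntu1404": "ubuntu",
        "ubuntu1604": "ubuntu",
    }

    def default_username(base_os):
        """Return the default login user for a supported base_os value."""
        return DEFAULT_USERNAMES[base_os]

    print(default_username("centos7"))  # -> centos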
See :ref:`VPC Section `. :: @@ -449,8 +479,10 @@ See :ref:`VPC Section `. :: ebs_settings """""""""""" -Settings section relating to EBS volume mounted on the master. When using multiple EBS volumes, enter multiple settings -as a comma separated list. Up to 5 EBS volumes are supported. +Settings section related to the EBS volume mounted on the master instance. When using +multiple EBS volumes, enter these parameters as a comma separated list. + +Up to five (5) additional EBS volumes are supported. See :ref:`EBS Section `. :: @@ -458,7 +490,7 @@ See :ref:`EBS Section `. :: scaling_settings """""""""""""""" -Settings section relation to scaling +Settings section relating to autoscaling configuration. See :ref:`Scaling Section `. :: @@ -466,7 +498,7 @@ See :ref:`Scaling Section `. :: efs_settings """""""""""" -Settings section relating to EFS filesystem +Settings section relating to EFS filesystem. See :ref:`EFS Section `. :: @@ -474,24 +506,31 @@ See :ref:`EFS Section `. :: raid_settings """"""""""""" -Settings section relating to RAID drive configuration. +Settings section relating to EBS volume RAID configuration. See :ref:`RAID Section `. :: raid_settings = rs +fsx_settings +"""""""""""" +Settings section relating to FSx Lustre configuration. + +See :ref:`FSx Section `. :: + + fsx_settings = fs + tags """" -Defines tags to be used in CloudFormation. +Defines tags to be used by CloudFormation. -If command line tags are specified via `--tags`, they get merged with config tags. +If command line tags are specified via `--tags`, they will be merged with config tags. Command line tags overwrite config tags that have the same key. -Tags are JSON formatted and should not have quotes outside the curly braces. +Tags are JSON formatted and should never have quotes outside the curly braces. -See `AWS CloudFormation Resource Tags Type -`_. :: +See `AWS CloudFormation Resource Tags Type `_. :: tags = {"key" : "value", "key2" : "value2"} @@ -507,21 +546,21 @@ VPC Configuration Settings:: vpc_id """""" -ID of the VPC you want to provision cluster into. :: +ID of the VPC to provision cluster into. :: vpc_id = vpc-xxxxxx master_subnet_id """""""""""""""" -ID of an existing subnet you want to provision the Master server into. :: +ID of an existing subnet to provision the Master server into. :: master_subnet_id = subnet-xxxxxx ssh_from """""""" -CIDR formatted IP range in which to allow SSH access from. +CIDR-formatted IP range to allow SSH access from. -This is only used when AWS ParallelCluster creates the security group. +This parameter is only used when AWS ParallelCluster creates the security group. Defaults to 0.0.0.0/0. :: @@ -537,24 +576,25 @@ Defaults to NONE. :: compute_subnet_id """"""""""""""""" -ID of an existing subnet you want to provision the compute nodes into. +ID of an existing subnet to provision the compute nodes into. -If it is private, you need to setup NAT for web access. :: +If the subnet is private, you will need to setup NAT for web access. :: compute_subnet_id = subnet-xxxxxx compute_subnet_cidr """"""""""""""""""" -If you wish for AWS ParallelCluster to create a compute subnet, this is the CIDR that. :: +If you want AWS ParallelCluster to create a compute subnet, designate the CIDR block here. :: compute_subnet_cidr = 10.0.100.0/24 use_public_ips """""""""""""" -Define whether or not to assign public IP addresses to Compute EC2 instances. +Defines whether or not to assign public IP addresses to compute instances. If true, an Elastic IP will be associated to the Master instance. 
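The subnet's "Auto-assign Public IP" setting, referenced below, can be checked with boto3; the sketch is illustrative only and the subnet ID is a placeholder::

    import boto3

    ec2 = boto3.client("ec2", region_name="us-east-1")
    subnet = ec2.describe_subnets(SubnetIds=["subnet-xxxxxx"])["Subnets"][0]
    # MapPublicIpOnLaunch is the "Auto-assign Public IP" subnet setting.
    print("Auto-assign public IP:", subnet["MapPublicIpOnLaunch"])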
-If false, the Master instance will have a Public IP or not according to the value + +If false, the Master instance will have a Public IP (or not) according to the value of the "Auto-assign Public IP" subnet configuration parameter. See :ref:`networking configuration ` for some examples. @@ -575,7 +615,8 @@ Defaults to NONE. :: ebs ^^^ -EBS Volume configuration settings for the volumes mounted on the master node and shared via NFS to compute nodes. :: +EBS volume configuration settings for the volumes mounted on the master instance and +shared via NFS to the compute nodes. :: [ebs custom1] shared_dir = vol1 @@ -592,14 +633,18 @@ EBS Volume configuration settings for the volumes mounted on the master node and shared_dir """""""""" -Path/mountpoint for shared EBS volume. Required when using multiple EBS volumes. When using 1 ebs volume, this option -will overwrite the shared_dir specified under the cluster section. The example below mounts to /vol1 :: +Path where the shared EBS volume will be mounted. + +This parameter is required when using multiple EBS volumes. + +When using one (1) EBS volume, this option will overwrite the shared_dir specified +under the cluster section. The example below mounts to /vol1 :: shared_dir = vol1 ebs_snapshot_id """"""""""""""" -Id of EBS snapshot if using snapshot as source for volume. +Defines the EBS snapshot Id if using a snapshot as the source for the volume. Defaults to NONE. :: @@ -607,8 +652,7 @@ Defaults to NONE. :: volume_type """"""""""" -The `API name `_ for the type of volume you -wish to launch. +The `EBS volume type `_ of the volume you wish to launch. Defaults to gp2. :: @@ -624,21 +668,32 @@ Defaults to 20GB. :: volume_iops """"""""""" -Number of IOPS for io1 type volumes. :: +Defines the number of IOPS for io1 type volumes. :: volume_iops = 200 encrypted """"""""" -Whether or not the volume should be encrypted (should not be used with snapshots). +Controls if the EBS volume should be encrypted (note: this should *not* be used with snapshots). Defaults to false. :: encrypted = false +ebs_kms_key_id +"""""""""""""" +Use a custom KMS Key for encryption. + +This parameter must be used in conjunction with ``encrypted = true`` and needs to +have a custom ``ec2_iam_role``. + +See :ref:`Disk Encryption with a Custom KMS Key `. :: + + ebs_kms_key_id = xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ebs_volume_id """"""""""""" -EBS Volume Id of an existing volume that will be attached to the MasterServer. +Defines the volume Id of an existing EBS volume that will be attached to the master instance. Defaults to NONE. :: @@ -650,7 +705,6 @@ scaling ^^^^^^^ Settings which define how the compute nodes scale. :: - [scaling custom] scaledown_idletime = 10 @@ -664,15 +718,14 @@ Defaults to 10. :: scaledown_idletime = 10 - examples ^^^^^^^^ -Let's say you want to launch a cluster with the awsbatch scheduler and let batch pick the optimal instance type, based -on your jobs resource needs. +Suppose you want to launch a cluster with the awsbatch scheduler and let batch pick +the optimal instance type, based on your jobs resource needs. -The following allows a maximum of 40 concurrent vCPUs, and scales down to zero when you have no jobs running for 10 -minutes. :: +The following configuration allows a maximum of 40 concurrent vCPUs and scales down +to zero when no jobs have run for 10 minutes. :: [global] update_check = true @@ -737,8 +790,7 @@ minutes. :: EFS ^^^ -EFS file system configuration settings for the EFS mounted on the master node and compute nodes via nfs4. 
:: - +Defines configuration settings for the EFS mounted on the master and compute instances. :: [efs customfs] shared_dir = efs @@ -747,16 +799,19 @@ EFS file system configuration settings for the EFS mounted on the master node an shared_dir """""""""" -Shared directory that the file system will be mounted to on the master and compute nodes. +Defines the EFS mount point on the master and compute nodes. -This parameter is REQUIRED, the EFS section will only be used if this parameter is specified. -The below example mounts to /efs. Do not use NONE or /NONE as the shared directory.:: +This parameter is REQUIRED! The EFS section will only be used if shared_dir is specified. + +The example below will mount at /efs. + +Do not use NONE or /NONE as the shared directory.:: shared_dir = efs encrypted """"""""" -Whether or not the file system will be encrypted. +Defines if the file system will be encrypted. Defaults to false. :: @@ -764,61 +819,80 @@ Defaults to false. :: performance_mode """""""""""""""" -Performance Mode of the file system. We recommend generalPurpose performance mode for most file systems. -File systems using the maxIO performance mode can scale to higher levels of aggregate throughput -and operations per second with a trade-off of slightly higher latencies for most file operations. -This can't be changed after the file system has been created. +Defines the Performance Mode of the file system. + +Valid choices are generalPurpose or maxIO (these are case-sensitive). + +We recommend generalPurpose performance mode for most file systems. -Defaults generalPurpose. Valid Values are generalPurpose | maxIO (case sensitive). :: +File systems using the maxIO performance mode can scale to higher levels of aggregate +throughput and operations per second with a trade-off of slightly higher latencies for +most file operations. + +This parameter cannot be changed after the file system has been created. + +Defaults to generalPurpose.:: performance_mode = generalPurpose throughput_mode """"""""""""""" -The throughput mode for the file system to be created. -There are two throughput modes to choose from for your file system: bursting and provisioned. +Defines the Throughput Mode of the file system. -Valid Values are provisioned | bursting :: +Valid options are bursting and provisioned.:: throughput_mode = provisioned provisioned_throughput """""""""""""""""""""" -The throughput, measured in MiB/s, that you want to provision for a file system that you're creating. -The limit on throughput is 1024 MiB/s. You can get these limits increased by contacting AWS Support. +Defines the provisioned throughput measured in MiB/s. + +This parameter requires setting throughput_mode to provisioned. -Valid Range: Min of 0.0. To use this option, must specify throughput_mode to provisioned :: +The limit on throughput is 1024 MiB/s. Please contact AWS Support to request a limit increase. + +Valid Range: Min of 0.0.:: provisioned_throughput = 1024 efs_fs_id """"""""" -File system ID for an existing file system. Specifying this option will void all other EFS options but shared_dir. -Config sanity will only allow file systems that: have no mount target in the stack's availability zone -OR have existing mount target in stack's availability zone with inbound and outbound NFS traffic allowed from 0.0.0.0/0. +Defines the EFS file system ID for an existing file system. 
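When reusing an existing file system via ``efs_fs_id``, its mount targets can be listed with boto3; this is roughly the lookup behind the sanity check described below, which additionally inspects the mount target security groups. The file system ID and region are placeholders::

    import boto3

    efs = boto3.client("efs", region_name="us-east-1")
    ec2 = boto3.client("ec2", region_name="us-east-1")

    targets = efs.describe_mount_targets(FileSystemId="fs-12345")["MountTargets"]
    for target in targets:
        subnet = ec2.describe_subnets(SubnetIds=[target["SubnetId"]])["Subnets"][0]
        # Print each mount target with the availability zone it lives in.
        print(target["MountTargetId"], subnet["AvailabilityZone"], target["LifeCycleState"])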
-Note: sanity check for validating efs_fs_id requires the IAM role to have permission for the following actions: -efs:DescribeMountTargets, efs:DescribeMountTargetSecurityGroups, ec2:DescribeSubnets, ec2:DescribeSecurityGroups. -Please add these permissions to your IAM role, or set `sanity_check = false` to avoid errors. +Specifying this option will void all other EFS options except for shared_dir. -CAUTION: having mount target with inbound and outbound NFS traffic allowed from 0.0.0.0/0 will expose the file system -to NFS mounting request from anywhere in the mount target's availability zone. We recommend not to have a mount target -in stack's availability zone and let us create the mount target. If you must have a mount target in stack's -availability zone, consider using a custom security group by providing a vpc_security_group_id option under the -vpc section, adding that security group to the mount target, and turning off config sanity to create the cluster. +config_sanity will only support file systems without a mount target in the stack's +availability zone *or* file systems that have an existing mount target in the stack's +availability zone with inbound and outbound NFS traffic allowed from 0.0.0.0/0. -Defaults to NONE. Needs to be an available EFS file system:: +The sanity check for validating efs_fs_id requires the IAM role to have the following permissions: - efs_fs_id = fs-12345 +efs:DescribeMountTargets +efs:DescribeMountTargetSecurityGroups +ec2:DescribeSubnets +ec2:DescribeSecurityGroups + +Please add these permissions to your IAM role or set `sanity_check = false` to avoid errors. + +CAUTION: Having mount target with inbound and outbound NFS traffic allowed from 0.0.0.0/0 +will expose the file system to NFS mounting request from anywhere in the mount target's +availability zone. AWS recommends *not* creating a mount target in the stack's availability +zone and letting us handle this step. If you must have a mount target in the stack's +availability zone, please consider using a custom security group by providing a vpc_security_group_id +option under the vpc section, adding that security group to the mount target, and turning +off config sanity to create the cluster. +Defaults to NONE.:: + + efs_fs_id = fs-12345 .. _raid_section: RAID ^^^^ -RAID drive configuration settings for creating a RAID array from a number of identical EBS volumes. The RAID drive -is mounted on the master node, and exported to compute nodes via nfs. :: - +Defines configuration settings for a RAID array built from a number of identical +EBS volumes. +The RAID drive is mounted on the master node and exported to compute nodes via NFS. :: [raid rs] shared_dir = raid @@ -828,26 +902,38 @@ is mounted on the master node, and exported to compute nodes via nfs. :: shared_dir """""""""" -Shared directory that the RAID drive will be mounted to on the master and compute nodes. +Defines the mount point for the RAID array on the master and compute nodes. -This parameter is REQUIRED, the RAID drive will only be created if this parameter is specified. -The below example mounts to /raid. Do not use NONE or /NONE as the shared directory.:: +The RAID drive will only be created if this parameter is specified. + +The example below will mount the array at /raid. + +Do not use NONE or /NONE as the shared directory.:: shared_dir = raid raid_type """"""""" -RAID type for the RAID array. Currently only support RAID 0 or RAID 1. For more information on RAID types, -see: `RAID info `_ +Defines the RAID type for the RAID array. 
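How ``raid_type`` and ``num_of_raid_volumes`` (described just below) combine into usable capacity can be illustrated with a small helper; this is a plain arithmetic sketch, not pcluster code::

    def usable_capacity_gb(raid_type, num_volumes, volume_size_gb=20):
        """Usable array size: RAID 0 stripes all volumes, RAID 1 mirrors one."""
        if raid_type == 0:
            return num_volumes * volume_size_gb
        if raid_type == 1:
            return volume_size_gb
        raise ValueError("only RAID 0 and RAID 1 are supported")

    print(usable_capacity_gb(0, 2))  # 40 GB striped
    print(usable_capacity_gb(1, 2))  # 20 GB mirrored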
+ +Valid options are RAID 0 or RAID 1. + +For more information on RAID types, see: `RAID info +`_ -This parameter is REQUIRED, the RAID drive will only be created if this parameter is specified. -The below example will create a RAID 0 array:: +The RAID drive will only be created if this parameter is specified. + +The example below will create a RAID 0 array:: raid_type = 0 num_of_raid_volumes """"""""""""""""""" -The number of EBS volumes to assemble the RAID array from. Currently supports max of 5 volumes and minimum of 2. +Defines the number of EBS volumes to assemble the RAID array from. + +Minimum number of volumes = 2. + +Maximum number of volumes = 5. Defaults to 2. :: @@ -855,8 +941,9 @@ Defaults to 2. :: volume_type """"""""""" -The the type of volume you wish to launch. -See: `Volume type `_ for detail +Defines the type of volume to build. + +See: `Volume type `_ for more detail. Defaults to gp2. :: @@ -864,7 +951,7 @@ Defaults to gp2. :: volume_size """"""""""" -Size of volume to be created. +Defines the size of volume to be created. Defaults to 20GB. :: @@ -872,15 +959,131 @@ Defaults to 20GB. :: volume_iops """"""""""" -Number of IOPS for io1 type volumes. :: +Defines the number of IOPS for io1 type volumes. :: volume_iops = 500 encrypted """"""""" -Whether or not the file system will be encrypted. +Determines if the file system will be encrypted. Defaults to false. :: encrypted = false +ebs_kms_key_id +"""""""""""""" +Use a custom KMS Key for encryption. + +This must be used in conjunction with ``encrypted = true`` and must have a custom ``ec2_iam_role``. + +See :ref:`Disk Encryption with a Custom KMS Key `. :: + + ebs_kms_key_id = xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + + +.. _fsx_section: + +FSx +^^^ +Configuration for an attached FSx Lustre file system. See `FSx CreateFileSystem +`_ for more information. + +Use an existing FSx file system by specifying ``fsx_fs_id``. :: + + [fsx fs] + shared_dir = /fsx + fsx_fs_id = fs-073c3803dca3e28a6 + +Or create and configure a new file system, with the following parameters :: + + [fsx fs] + shared_dir = /fsx + storage_capacity = 3600 + fsx_kms_key_id = 9e8a129c-0e85-459-865b-3a5be974a22b + imported_file_chunk_size = 1024 + export_path = s3://bucket/folder + import_path = s3://bucket + weekly_maintenance_start_time = 1:00:00 + +shared_dir +"""""""""" +**Required** Defines the mount point for the Lustre File system on the master and compute nodes. + +The example below will mount the filesystem at /fsx. + +Do not use NONE or /NONE as the shared directory.:: + + shared_dir = /fsx + +fsx_fs_id +""""""""" +**Optional** Attach an existing FSx File System. + +If this option is specified, all following FSx parameters, such as ``storage_capacity`` are ignored. :: + + fsx_fs_id = fs-073c3803dca3e28a6 + +storage_capacity +"""""""""""""""" +**Optional** The storage capacity of the file system in GiB. + +The storage capacity has a minimum of 3,600 GiB and is provisioned in increments of 3,600 GiB. + +Defaults to 3,600 GiB. :: + + storage_capacity = 3600 + +fsx_kms_key_id +"""""""""""""" +**Optional** The ID of your AWS Key Management Service (AWS KMS) key. + +This ID is used to encrypt the data in your file system at rest. + +This must be used with a custom ``ec2_iam_role``. See +:ref:`Disk Encryption with a Custom KMS Key `. 
:: + + fsx_kms_key_id = xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + +imported_file_chunk_size +"""""""""""""""""""""""" +**Optional** For files imported from a data repository (using ``import_path``), this value determines the stripe count +and maximum amount of data per file (in MiB) stored on a single physical disk. The maximum number of disks that a single +file can be striped across is limited by the total number of disks that make up the file system. + +The chunk size default is 1,024 MiB (1 GiB) and can go as high as 512,000 MiB (500 GiB). +Amazon S3 objects have a maximum size of 5 TB. :: + + imported_file_chunk_size = 1024 + +export_path +""""""""""" + +**Optional** The S3 path where the root of your file system is exported. The path **must** be in the same S3 bucket as +the ``import_path`` parameter. + +Defaults to ``s3://import-bucket/FSxLustre[creation-timestamp]`` where ``import-bucket`` is the bucket provided in +``import_path`` parameter. :: + + export_path = s3://bucket/folder + +import_path +""""""""""" +***Optional** S3 Bucket to load data from into the file system. Also serves as the export bucket. See ``export_path``. + +Import occurs on cluster creation, see `Importing Data from your Amazon S3 Bucket +`_ + +If not provided, file system will be empty. :: + + import_path = s3://bucket + +weekly_maintenance_start_time +""""""""""""""""""""""""""""" +***Optional** Preferred time to perform weekly maintenance, in UTC time zone. + +Format is [day of week]:[hour of day]:[minute of hour]. For example, Monday at Midnight is: :: + + weekly_maintenance_start_time = 1:00:00 + + diff --git a/docs/iam.rst b/docs/iam.rst index 24dad0716e..3e66a0e32f 100644 --- a/docs/iam.rst +++ b/docs/iam.rst @@ -420,6 +420,14 @@ In case you are using SGE, Slurm or Torque as a scheduler: ], "Effect": "Allow", "Resource": "*" + }, + { + "Sid": "SSMDescribe", + "Action": [ + "ssm:GetParametersByPath" + ], + "Effect": "Allow", + "Resource": "*" } ] } diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 2310244164..6e068f4c02 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -58,3 +58,7 @@ DescribeMountTargetSecurityGroups DescribeSubnets DescribeSecurityGroups num +FSx +fsx +Lustre +GiB diff --git a/docs/tutorials/04_encrypted_kms_fs.rst b/docs/tutorials/04_encrypted_kms_fs.rst new file mode 100644 index 0000000000..e0595b0dce --- /dev/null +++ b/docs/tutorials/04_encrypted_kms_fs.rst @@ -0,0 +1,76 @@ +.. _tutorials_encrypted_kms_fs: + +.. toctree:: + :maxdepth: 2 + +##################################### +Disk Encryption with a Custom KMS Key +##################################### + +AWS ParallelCluster supports the configuration options ``ebs_kms_key_id`` and ``fsx_kms_key_id``, which allow you to +provide a custom KMS key for EBS Disk encryption or FSx Lustre. To use them you'll need to specify a ``ec2_iam_role``. + +In order for the cluster to create, the KMS key needs to know the name of the cluster's role. This prevents you from +using the role created on cluster create, requiring a custom ``ec2_iam_role``. + + +Creating the Role +================= + +First you'll need to create a policy: + +1. Go to the IAM Console: https://console.aws.amazon.com/iam/home +2. Under Policies, create a policy, click the JSON tab +3. As the policy's body, paste in the :doc:`Instance Policy<../iam>` + Make sure to replace all occurrences of ```` and ```` +4. Call it ``ParallelClusterInstancePolicy`` and click "Create Policy" + +Next create a role: + +1. 
Under Roles, create a role +2. Click ``EC2`` as the trusted entity +3. Under Permissions, search for the ``ParallelClusterInstancePolicy`` role you just created and attach it. +4. Name it ``ParallelClusterRole`` and click "Create Role" + +Give your Key Permissions +========================= + +In the IAM Console > Encryption Keys > click on your key. + +Click "Add User" and search for the `ParallelClusterInstanceRole`` you just created. Attach it. + +Creating the Cluster +==================== + +Now create a cluster, here's an example of a cluster with encrypted ``Raid 0`` drives: :: + + [cluster default] + ... + raid_settings = rs + ec2_iam_role = ParallelClusterInstanceRole + + [raid rs] + shared_dir = raid + raid_type = 0 + num_of_raid_volumes = 2 + volume_size = 100 + encrypted = true + ebs_kms_key_id = xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + +Here's an example with FSx Lustre file system: :: + + [cluster default] + ... + fsx_settings = fs + ec2_iam_role = ParallelClusterInstanceRole + + [fsx fs] + shared_dir = /fsx + storage_capacity = 3600 + imported_file_chunk_size = 1024 + export_path = s3://bucket/folder + import_path = s3://bucket + weekly_maintenance_start_time = 1:00:00 + fsx_kms_key_id = xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + +Similar configuration applies for EBS and FSx based file systems. diff --git a/tests/cluster-check.sh b/tests/cluster-check.sh index a9343e7231..58c4b97108 100755 --- a/tests/cluster-check.sh +++ b/tests/cluster-check.sh @@ -83,17 +83,10 @@ submit_launch() { submit_init ${scheduler} - ${scheduler}_submit + echo "$(date +%s)" > jobs_start_time - done=0 - while test $done = 0 ; do - if test -f job1.done -a -f job2.done -a -f job3.done; then - done=1 - else - sleep 1 - fi - done - echo "Scaleup successful" + ${scheduler}_submit + echo "Jobs submitted successfully" } submit_init() { @@ -123,18 +116,18 @@ slurm_submit() { cat > job1.sh < job1.done EOF cat > job2.sh < job2.done EOF cat > job3.sh < job3.done EOF chmod +x job1.sh job2.sh job3.sh @@ -154,7 +147,7 @@ sge_submit() { #$ -R y sleep ${_sleepjob1} -touch job1.done +echo "\$(date +%s)" > job1.done EOF cat > job2.sh < job2.done EOF cat > job3.sh < job3.done EOF chmod +x job1.sh job2.sh job3.sh @@ -186,18 +179,18 @@ torque_submit() { cat > job1.sh < job1.done EOF cat > job2.sh < job2.done EOF cat > job3.sh < job3.done EOF chmod +x job1.sh job2.sh job3.sh diff --git a/tests/ebs_volume_test.py b/tests/ebs_volume_test.py index be043e80ce..d8d0009f42 100644 --- a/tests/ebs_volume_test.py +++ b/tests/ebs_volume_test.py @@ -1,4 +1,3 @@ -import argparse import datetime import os import Queue @@ -12,7 +11,9 @@ import time from builtins import exit +import argparse import boto3 + import process_helper as prochelp UNSUPPORTED_REGIONS = set(["ap-northeast-3", "eu-west-3"]) diff --git a/tests/efs-test.py b/tests/efs-test.py index 6e0f122657..e39c5d26f6 100644 --- a/tests/efs-test.py +++ b/tests/efs-test.py @@ -1,4 +1,3 @@ -import argparse import datetime import os import Queue @@ -10,7 +9,9 @@ import time from builtins import exit +import argparse import boto3 + import process_helper as prochelp UNSUPPORTED_REGIONS = set(["ap-northeast-3", "eu-west-3"]) diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md new file mode 100644 index 0000000000..c16013c30f --- /dev/null +++ b/tests/integration-tests/README.md @@ -0,0 +1,472 @@ +# AWS ParallelCluster Integration Testing Framework + +The framework used to implement and run integration tests for AWS ParallelCluster is made of two main 
components:
+* **Integration Tests Orchestrator**: a CLI that submits the integration tests. It takes care of setting up the
+test environment, orchestrates parallel test execution and generates the final test reports.
+* **Integration Tests Framework**: the actual testing framework, based on pytest, which defines a series of
+fixtures and markers that allow parametrizing test execution across several dimensions, easily managing cluster
+lifecycle and reuse, and performing cleanup on failures. It also offers a set of utility functions
+that implement features common to all tests, such as remote command execution and ParallelCluster
+config generation.
+
+## Run Integration Tests
+
+Before executing the integration tests, install all the Python dependencies required by the framework by running
+the following command:
+```bash
+pip install -r tests/integration-tests/requirements.txt
+```
+
+Once this is done, you can check the help of the orchestrator CLI to list all the available options:
+```bash
+cd tests/integration-tests
+python -m test_runner -h
+```
+
+Here is an example of a test submission:
+```bash
+python -m test_runner \
+    --key-name "ec2-key-name" \
+    --key-path "~/.ssh/ec2-key.pem" \
+    --regions "eu-west-1" "us-east-1" \
+    --instances "c4.xlarge" "c5.xlarge" \
+    --oss "alinux" "centos7" \
+    --schedulers "awsbatch" "sge" \
+    --parallelism 8 \
+    --retry-on-failures \
+    --reports html junitxml json
+```
+
+Executing the command runs an integration testing suite with the following features:
+* "ec2-key-name" is used to configure EC2 keys
+* "~/.ssh/ec2-key.pem" is used to ssh into cluster instances and corresponds to the EC2 key ec2-key-name
+* tests are executed in all combinations of (region, instance, os, scheduler), where each dimension is expanded
+with the specified values
+* tests are executed in parallel in all regions and for each region 8 tests are executed concurrently
+* in case of failures the failed tests are retried once more after a delay of 60 seconds
+* test reports are generated in html, junitxml and json formats
+
+### Tests Outputs & Reports
+
+The following options can be used to control test outputs and reports:
+* `--output-dir path/to/dir`: specifies the base dir where test outputs and logs will be saved.
+Defaults to tests_outputs.
+* `--reports {html,junitxml,json}`: selects which test reports to generate.
+* `--show-output`: when specified, stdout is not redirected to file. Useful when developing the tests but not
+recommended when running parallel tests.
+
+Here are the files produced by the tests and where they are stored when running with the default `output-dir`
+and `--reports html junitxml json`:
+```
+tests_outputs
+├── $timestamp.logs: directory containing log files
+│   ├── $region_i.log: log outputs for a single region
+│   └── ...
+└── $timestamp.out: directory containing tests reports
+    ├── $region_i: directory containing tests reports for a single region
+    │   ├── clusters_configs: directory storing all cluster configs used by test
+    │   │   ├── test_awsbatch.py::test_job_submission[c5.xlarge-eu-west-1-alinux-awsbatch].config
+    │   │   └── ...
+    │   ├── pytest.out: stdout of pytest for the given region
+    │   ├── results.html: html report for the given region
+    │   └── results.xml: junitxml report for the given region
+    ├── test_report.json: global json report
+    └── test_report.xml: global junitxml report
+```
+
+If the tests are run sequentially, by adding the `--sequential` option, the result is the following:
+```
+tests_outputs
+├── $timestamp.logs
+│   └── all_regions.log: log outputs for all regions
+└── $timestamp.out
+    ├── clusters_configs: directory storing all cluster configs used by test
+    │   ├── test_playground.py::test_factory.config
+    │   └── ...
+    ├── pytest.out: global pytest stdout
+    ├── results.html: global html report
+    ├── results.xml: same as test_report.xml
+    ├── test_report.json: global json report
+    └── test_report.xml: global junitxml report
+```
+
+### Specify Tests Dimensions
+The following options can be used to control the parametrization of test cases:
+* `-r REGIONS [REGIONS ...], --regions REGIONS [REGIONS ...]`: AWS regions where tests are executed.
+* `-i INSTANCES [INSTANCES ...], --instances INSTANCES [INSTANCES ...]`: AWS instances under test.
+* `-o OSS [OSS ...], --oss OSS [OSS ...]`: OSs under test.
+* `-s SCHEDULERS [SCHEDULERS ...], --schedulers SCHEDULERS [SCHEDULERS ...]`: Schedulers under test.
+
+Note that each test case can specify a subset of dimensions it is allowed to run against (for example
+a test case written specifically for the awsbatch scheduler should only be executed against the awsbatch scheduler).
+This means that the final parametrization of the tests is given by the intersection of the input dimensions and
+the test-specific dimensions, so that all constraints are satisfied.
+
+### Parallelize Tests Execution
+The following options can be used to control tests parallelism:
+* `--sequential`: by default the tests orchestrator executes a separate parallel process for each region under test.
+By specifying this option all tests are executed sequentially in a single process.
+* `-n PARALLELISM, --parallelism PARALLELISM`: specifies the degree of parallelism for each process. It is
+useful to limit the number of clusters that are created concurrently in a specific region so that AWS account limits
+are not exceeded.
+
+### Retry On Failures
+When passing the `--retry-on-failures` flag, failed tests are retried once more after a delay of 60 seconds.
+
+### Run Tests For Specific Features
+The `-f FEATURES [FEATURES ...], --features FEATURES [FEATURES ...]` option limits the test cases to execute
+by only running those that are meant to verify a specific feature or subset of features.
+
+To execute a subset of features, pass the `-f` option a list of markers that identify the test cases
+to run. For example, when passing `-f "awsbatch" "not advanced"`, all test cases marked with `@pytest.mark.awsbatch`
+and not marked with `@pytest.mark.advanced` are executed.
+
+It is good practice to mark test cases with a series of markers that identify the feature under test.
+Every test is marked by default with a marker matching its filename with the `test_` prefix or `_test` suffix removed.
+
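+For example, a test case meant to verify the awsbatch scheduler could be tagged explicitly with the marker used in
+the `-f` example above (a minimal sketch; the test name and body are hypothetical):
+```python
+import pytest
+
+
+@pytest.mark.awsbatch
+def test_job_submission(region, instance, os, scheduler):
+    # Collected when running with `-f "awsbatch"`; filtered out otherwise.
+    ...
+```
+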
+### Custom Templates And Packages
+
+To use custom templates or package URLs, the following options are available:
+* `--custom-node-url`: URL to a custom node package.
+* `--custom-cookbook-url`: URL to a custom cookbook package.
+* `--custom-template-url`: URL to a custom cfn template.
+* `--custom-awsbatch-template-url`: URL to a custom awsbatch cfn template.
+* `--custom-awsbatchcli-url`: URL to a custom awsbatch cli package.
+
+The configuration for the custom templates and packages is automatically injected into
+all cluster configs when these are rendered. If any of these parameters is already set
+in the cluster config, the value from the config is used.
+
+## Write Integration Tests
+
+All integration tests are defined in the `integration-tests/tests` directory.
+
+When executing the test_runner, tests are automatically discovered by following the default pytest discovery rules:
+* search for `test_*.py` or `*_test.py` files, imported by their test package name.
+* from those files, collect test items:
+  * `test_` prefixed test functions or methods outside of a class
+  * `test_` prefixed test functions or methods inside `Test` prefixed test classes (without an `__init__` method)
+
+Test cases are organized in separate files where the file name is `test_$feature_under_test`. For example, test
+cases specific to the awsbatch scheduler can be defined in a file named `test_awsbatch.py`.
+If a single feature requires several tests, it is possible to split them across several files and group them in a
+common directory. Directories can also be used to group tests belonging to the same category. For instance, all tests
+related to storage options could be grouped in the following fashion:
+```
+integration-tests
+└── tests
+    └── storage
+        ├── test_ebs.py
+        ├── test_raid.py
+        └── test_efs.py
+```
+
+*The testing framework is heavily based on [pytest](https://docs.pytest.org/en/latest/contents.html) and it makes use of
+some specific pytest concepts such as [fixtures](https://doc.pytest.org/en/latest/fixture.html). To better understand
+the implementation details behind the testing framework, it is highly recommended to have a quick look at the basic
+pytest key concepts first. This is not required if you only want to add new test cases without modifying the framework
+itself.*
+
+### Define Parametrized Test Cases
+
+Here is how to define a simple parametrized test case:
+```python
+def test_case_1(region, instance, os, scheduler):
+```
+This test case will be automatically parametrized and executed for all combinations of the input dimensions.
+For example, given the input dimensions `--regions "eu-west-1" --instances "c4.xlarge" --oss "alinux"
+"ubuntu1604" --schedulers "sge" "slurm"`, the following tests will run:
+```
+test_case_1[eu-west-1-c4.xlarge-alinux-sge]
+test_case_1[eu-west-1-c4.xlarge-ubuntu1604-sge]
+test_case_1[eu-west-1-c4.xlarge-alinux-slurm]
+test_case_1[eu-west-1-c4.xlarge-ubuntu1604-slurm]
+```
+
+If you don't need to reference the parametrized arguments in your test case, you can simply replace the
+function arguments with this annotation: `@pytest.mark.usefixtures("region", "os", "instance", "scheduler")`
+
+```python
+@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"])
+@pytest.mark.instances(["c5.xlarge", "t2.large"])
+@pytest.mark.dimensions("*", "*", "alinux", "awsbatch")
+@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
+def test_case_2():
+```
+
+If you want to add another level of parametrization that only applies to a single test case or a subset of
+test cases, you can do it in the following way:
+
+```python
+@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
+@pytest.mark.parametrize("cluster_max_size", [5, 10])
+def test_case_2(cluster_max_size):
+```
+
+### Restrict Test Cases Dimensions
+
+It is possible to restrict the dimensions each test is compatible with by using some custom markers.
+The available markers are the following:
+```python
+@pytest.mark.instances(instances_list): run test only against the listed instances
+@pytest.mark.regions(regions_list): run test only against the listed regions
+@pytest.mark.oss(os_list): run test only against the listed oss
+@pytest.mark.schedulers(schedulers_list): run test only against the listed schedulers
+@pytest.mark.dimensions(region, instance, os, scheduler): run test only against the listed dimensions
+@pytest.mark.skip_instances(instances_list): skip test for the listed instances
+@pytest.mark.skip_regions(regions_list): skip test for the listed regions
+@pytest.mark.skip_oss(os_list): skip test for the listed oss
+@pytest.mark.skip_schedulers(schedulers_list): skip test for the listed schedulers
+@pytest.mark.skip_dimensions(region, instance, os, scheduler): skip test for the listed dimensions
+```
+
+For example, given the following test definition:
+```python
+@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"])
+@pytest.mark.instances(["c5.xlarge", "t2.large"])
+@pytest.mark.dimensions("*", "*", "alinux", "awsbatch")
+def test_case_1(region, instance, os, scheduler):
+```
+The test is allowed to run only against the following subset of dimensions:
+* region has to be one of `["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]`
+* instance has to be one of `"c5.xlarge", "t2.large"`
+* os has to be `alinux`
+* scheduler has to be `awsbatch`
+
+While the following test case:
+```python
+@pytest.mark.skip_regions(["us-east-1", "eu-west-1"])
+@pytest.mark.skip_dimensions("*", "c5.xlarge", "alinux", "awsbatch")
+@pytest.mark.skip_dimensions("*", "c4.xlarge", "centos6", "sge")
+def test_case_2(region, instance, os, scheduler):
+```
+is allowed to run only if:
+* region is not one of `["us-east-1", "eu-west-1"]`
+* the triplet (instance, os, scheduler) is neither `("c5.xlarge", "alinux", "awsbatch")` nor
+`("c4.xlarge", "centos6", "sge")`
+
+#### Default Invalid Dimensions
+
+Some combinations of dimensions are not allowed, for example because a specific instance type is not
+available in a given AWS region.
+
+To define such exceptions, extend the `UNSUPPORTED_DIMENSIONS` list in the conftest_markers.py file.
+By default, all tuples specified in that list are added as a `skip_dimensions` marker to all tests.
+
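+As a reference, the list currently contains the tuples below; the last entry is a hypothetical addition showing how
+a further unsupported combination could be declared:
+```python
+# conftest_markers.py (sketch)
+UNSUPPORTED_DIMENSIONS = [
+    ("eu-north-1", "c4.xlarge", "*", "*"),
+    ("eu-west-3", "c4.xlarge", "*", "*"),
+    # hypothetical addition: skip the awsbatch scheduler on centos6 in every region
+    ("*", "*", "centos6", "awsbatch"),
+]
+```
+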
+### Manage Tests Data
+
+Tests data and resources are organized in the following directories:
+```
+integration-tests
+└── tests
+    ├── $test_file_i.py: contains resources for test cases defined in file $test_file_i.py
+    │   └── $test_case_i: contains resources for test case $test_case_i
+    │       ├── data_file
+    │       ├── pcluster.config.ini
+    │       └── test_script.sh
+    └── data: contains common resources to share across all tests
+        └── shared_dir_1
+            └── shared_file_1
+```
+
+[pytest-datadir](https://github.com/gabrielcnr/pytest-datadir) is a pytest plugin that is used for manipulating test
+data directories and files.
+
+A `test_datadir` fixture is built on top of it and can be used to inject the `datadir` with resources for the
+specific test function.
+
+For example, in the following test, defined in the file `test_feature.py`:
+```python
+def test_case_1(region, instance, os, scheduler, test_datadir):
+```
+the argument `test_datadir` is initialized at each test run with a path to a temporary directory that contains
+a copy of the contents of `integration-tests/tests/test_feature/test_case_1`.
+This way the test case can freely modify the contents of that dir at each run without compromising other test
+executions.
+
+The fixture `shared_datadir` can be used similarly to access the shared resources directory.
+
+### Parametrized Clusters Configurations
+
+Similarly to test cases, cluster configurations can be parametrized or, even better, written with
+[Jinja2](http://jinja.pocoo.org/docs/2.10/) templating syntax.
+
+The cluster configuration needed for a given test case must reside in the test-specific `test_datadir`,
+in a file named pcluster.config.ini.
+
+Test cases can then inject a fixture called `pcluster_config_reader`, which automatically reads and renders
+the configuration defined for a specific test case and parametrizes it with the default
+test dimensions and additional test options (such as the value assigned to `key_name`).
+ +For example in the following test, defined in the file `test_feature.py`: +```python +def test_case_1(region, instance, os, scheduler, pcluster_config_reader): + cluster_config = pcluster_config_reader(vpc_id="id-xxx", master_subnet_id="id-xxx", compute_subnet_id="id-xxx") +``` +you can simply render the parametrized cluster config which is defined in the file +`integration-tests/tests/test_feature/test_case_1/pcluster.config.ini` + +Here is an example of the parametrized pcluster config: +```INI +[global] +cluster_template = awsbatch + +[aws] +aws_region_name = {{ region }} + +[cluster awsbatch] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = awsbatch +compute_instance_type = {{ instance }} +min_vcpus = 2 +desired_vcpus = 2 +max_vcpus = 24 + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +``` + +The following placeholders are automatically injected by the `pcluster_config_reader` fixture and are +available in the `pcluster.config.ini` files: +* Test dimensions for the specific parametrized test case: `{{ region }}`, `{{ instance }}`, `{{ os }}`, +`{{ scheduler }}` +* EC2 key name specified at tests submission time by the user: `{{ key_name }}` +* VPC related parameters: `{{ vpc_id }}`, `{{ public_subnet_id }}`, `{{ private_subnet_id }}` + +Additional parameters can be specified when calling the fixture to retrieve the rendered configuration +as shown in the example above. + +### VPC Configuration + +A VPC and the related subnets are automatically configured at the start of the integration tests for each region under +test. These resources are shared across all the tests and deleted when all tests are completed. + +The idea is to create a single VPC per region and have multiple subnets that allow to test different networking setups. +At the moment two subnets are generated (a private one and a public one) with the current configuration: + +```python +public_subnet = SubnetConfig( + name="PublicSubnet", + cidr="10.0.0.0/24", + map_public_ip_on_launch=True, + has_nat_gateway=True, + default_gateway=Gateways.INTERNET_GATEWAY, +) +private_subnet = SubnetConfig( + name="PrivateSubnet", + cidr="10.0.1.0/24", + map_public_ip_on_launch=False, + has_nat_gateway=False, + default_gateway=Gateways.NAT_GATEWAY, +) +vpc = VPCConfig( + name="vpc", + cidr="10.0.0.0/16", + enable_dns_support=True, + enable_dns_hostnames=True, + has_internet_gateway=True, + subnets = [private_subnet, public_subnet], +) +``` + +Behind the scenes a CloudFormation template is dynamically generated by the `VPCTemplateBuilder` +(leveraging a tool called [Troposphere](https://github.com/cloudtools/troposphere)) and a VPC is created in each region +under test by the `vpc_stacks` autouse session fixture. + +Parameters related to the generated VPC and Subnets are automatically exported to the Jinja template engine and +in particular are available when using the `pcluster_config_reader` fixture, as shown above. The only thing to do +is to use them when defining the cluster config for the specific test case: + +```INI +... +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +``` + +### Create/Destroy Clusters + +Cluster lifecycle management is fully managed by the testing framework and is exposed through the fixture +`clusters_factory`. 
+ +Here is an example of how to use it: +```python +def test_case_1(region, instance, os, scheduler, pcluster_config_reader, clusters_factory): + cluster_config = pcluster_config_reader(vpc_id="aaa", master_subnet_id="bbb", compute_subnet_id="ccc") + cluster = clusters_factory(cluster_config) +``` + +The factory can be used as shown above to create one or multiple clusters that will be automatically +destroyed when the test completes or in case of unexpected errors. + +`cluster_factory` fixture also takes care of dumping a copy of the configuration used to create each cluster +in the tests output directory. + +The object returned by clusters_factory is a `Cluster` instance that contains all the necessary cluster information, +included the CloudFormation stack outputs. + +### Execute Remote Commands + +To execute remote commands or scripts on the Master instance of the cluster under test, the `RemoteCommandExecutor` +class can be used. It simply requires a valid `Cluster` object to be initialized and it offers some utility +methods to execute remote commands and scripts as shown in the example below: + +```python +import logging +from remote_command_executor import RemoteCommandExecutor +def test_case_1(region, instance, os, scheduler, pcluster_config_reader, clusters_factory, test_datadir): + cluster_config = pcluster_config_reader(vpc_id="aaa", master_subnet_id="bbb", compute_subnet_id="ccc") + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + result = remote_command_executor.run_remote_command("env") + logging.info(result.stdout) + result = remote_command_executor.run_remote_command(["echo", "test"]) + logging.info(result.stdout) + result = remote_command_executor.run_remote_script( + str(test_datadir / "test_script.sh"), args=["1", "2"], additional_files=[str(test_datadir / "data_file")] + ) + logging.info(result.stdout) +``` + +and here is the structure of the datadir if the test case is defined in the `test_feature.py` file: +``` +integration-tests +└── tests +    └── test_feature +       └── test_case_1 +       ├── data_file +       ├── pcluster.config.ini +       └── test_script.sh + +``` + +### Logging + +A default logger is configured to write both to the stdout and to the log file dedicated to the specific test +process. When running in `--sequential` mode a single log file is created otherwise a +separate logfile is generated for each region. + +### Create CloudFormation Templates + +If additional AWS resources are needed by the integration tests you can use a session scoped fixture, +`cfn_stacks_factory`, which takes care of automatically manage creation and deletion of CFN stacks that live +for the entire duration of the tests. Deletion of all stacks is automatically performed when all tests +are completed. If you want to reduce the lifetime of a specific resource you can either create a separate similar +fixture with a reduced scope or you can directly use the CfnStacksFactory object (note: fixtures are always better to +handle resources cleanup.) + +An example is given by this piece of code that handles the creation of a test VPC: +```python +@pytest.fixture(autouse=True) +def vpc(cfn_stacks_factory): + # ... 
lines removed + template = VPCTemplateBuilder(vpc_config).build() + stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) + cfn_stacks_factory.create_stack(stack) + return stack +``` diff --git a/tests/integration-tests/__init__.py b/tests/integration-tests/__init__.py new file mode 100644 index 0000000000..221b7a2eca --- /dev/null +++ b/tests/integration-tests/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py new file mode 100644 index 0000000000..b8f9370c36 --- /dev/null +++ b/tests/integration-tests/cfn_stacks_factory.py @@ -0,0 +1,138 @@ +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import logging + +import boto3 +from botocore.exceptions import ClientError +from retrying import retry + +from utils import retrieve_cfn_outputs + + +class CfnStack: + """Identify a CloudFormation stack.""" + + def __init__(self, name, region, template): + self.name = name + self.region = region + self.template = template + self.cfn_stack_id = None + self.__cfn_outputs = None + + @property + def cfn_outputs(self): + """ + Return the CloudFormation stack outputs for the stack. + Outputs are retrieved only once and then cached. + """ + if not self.__cfn_outputs: + self.__cfn_outputs = retrieve_cfn_outputs(self.name, self.region) + return self.__cfn_outputs + + +class CfnStacksFactory: + """Manage creation and deletion of CloudFormation stacks.""" + + def __init__(self): + self.__created_stacks = {} + + def create_stack(self, stack): + """ + Create a cfn stack with a given template. + :param stack: stack to create. 
+ :return: + """ + name = stack.name + region = stack.region + id = self.__get_stack_internal_id(name, region) + if id in self.__created_stacks: + raise ValueError("Stack {0} already exists in region {1}".format(name, region)) + + # create the cluster + logging.info("Creating stack {0} in region {1}".format(name, region)) + self.__created_stacks[id] = stack + try: + cfn_client = boto3.client("cloudformation", region_name=region) + result = cfn_client.create_stack(StackName=name, TemplateBody=stack.template) + stack.cfn_stack_id = result["StackId"] + final_status = self.__wait_for_stack_creation(stack.cfn_stack_id, cfn_client) + self.__assert_stack_status(final_status, "CREATE_COMPLETE") + except Exception as e: + logging.error("Creation of stack {0} in region {1} failed with exception: {2}".format(name, region, e)) + raise + + logging.info("Cluster {0} created successfully in region {1}".format(name, region)) + + @retry( + stop_max_attempt_number=10, + wait_fixed=5000, + retry_on_exception=lambda exception: isinstance(exception, ClientError), + ) + def delete_stack(self, name, region): + """Destroy a created cfn stack.""" + id = self.__get_stack_internal_id(name, region) + if id in self.__created_stacks: + logging.info("Destroying stack {0} in region {1}".format(name, region)) + try: + stack = self.__created_stacks[id] + cfn_client = boto3.client("cloudformation", region_name=stack.region) + cfn_client.delete_stack(StackName=stack.name) + final_status = self.__wait_for_stack_deletion(stack.cfn_stack_id, cfn_client) + self.__assert_stack_status(final_status, "DELETE_COMPLETE") + except Exception as e: + logging.error("Deletion of stack {0} in region {1} failed with exception: {2}".format(name, region, e)) + raise + del self.__created_stacks[id] + logging.info("Cluster {0} deleted successfully in region {1}".format(name, region)) + else: + logging.warning("Couldn't find cluster with name {0} in region. 
Skipping deletion.".format(name, region)) + + def delete_all_stacks(self): + """Destroy all created stacks.""" + logging.debug("Destroying all cfn stacks") + for _, value in dict(self.__created_stacks).items(): + try: + self.delete_stack(value.name, value.region) + except Exception as e: + logging.error( + "Failed when destroying stack {0} in region {1} with exception {2}.".format( + value.name, value.region, e + ) + ) + + @retry( + retry_on_result=lambda result: result == "CREATE_IN_PROGRESS", + wait_fixed=5000, + retry_on_exception=lambda e: False, + ) + def __wait_for_stack_creation(self, name, cfn_client): + return self.__get_stack_status(name, cfn_client) + + @retry( + retry_on_result=lambda result: result == "DELETE_IN_PROGRESS", + wait_fixed=5000, + retry_on_exception=lambda e: False, + ) + def __wait_for_stack_deletion(self, name, cfn_client): + return self.__get_stack_status(name, cfn_client) + + @staticmethod + def __get_stack_status(name, cfn_client): + return cfn_client.describe_stacks(StackName=name).get("Stacks")[0].get("StackStatus") + + @staticmethod + def __assert_stack_status(status, expected_status): + if status != expected_status: + raise Exception("Stack status {0} differs from expected one {1}".format(status, expected_status)) + + @staticmethod + def __get_stack_internal_id(name, region): + return name + "-" + region diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py new file mode 100644 index 0000000000..ff7fed1f13 --- /dev/null +++ b/tests/integration-tests/clusters_factory.py @@ -0,0 +1,114 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging + +import configparser +from retrying import retry + +from utils import retrieve_cfn_outputs, retry_if_subprocess_error, run_command + + +class Cluster: + """Contain all static and dynamic data related to a cluster instance.""" + + def __init__(self, name, config_file, ssh_key): + self.name = name + self.config_file = config_file + self.ssh_key = ssh_key + self.config = configparser.ConfigParser() + self.config.read(config_file) + self.__cfn_outputs = None + + @property + def cfn_name(self): + """Return the name of the CloudFormation stack associated to the cluster.""" + return "parallelcluster-" + self.name + + @property + def region(self): + """Return the aws region the cluster is created in.""" + return self.config.get("aws", "aws_region_name", fallback="us-east-1") + + @property + def master_ip(self): + """Return the public ip of the cluster master node.""" + return self.cfn_outputs["MasterPublicIP"] + + @property + def os(self): + """Return the os used for the cluster.""" + cluster_template = self.config.get("global", "cluster_template", fallback="default") + return self.config.get("cluster {0}".format(cluster_template), "base_os", fallback="alinux") + + @property + def cfn_outputs(self): + """ + Return the CloudFormation stack outputs for the cluster. + Outputs are retrieved only once and then cached. 
+ """ + if not self.__cfn_outputs: + self.__cfn_outputs = retrieve_cfn_outputs(self.cfn_name, self.region) + return self.__cfn_outputs + + +class ClustersFactory: + """Manage creation and destruction of pcluster clusters.""" + + def __init__(self): + self.__created_clusters = {} + + def create_cluster(self, cluster): + """ + Create a cluster with a given config. + :param cluster: cluster to create. + """ + name = cluster.name + config = cluster.config_file + if name in self.__created_clusters: + raise ValueError("Cluster {0} already exists".format(name)) + + # create the cluster + logging.info("Creating cluster {0} with config {1}".format(name, config)) + self.__created_clusters[name] = cluster + result = run_command(["pcluster", "create", "--config", config, name]) + if "Status: {0} - CREATE_COMPLETE".format(cluster.cfn_name) not in result.stdout: + error = "Cluster creation failed for {0} with output: {1}".format(name, result.stdout) + logging.error(error) + raise Exception(error) + logging.info("Cluster {0} created successfully".format(name)) + + @retry(stop_max_attempt_number=10, wait_fixed=5000, retry_on_exception=retry_if_subprocess_error) + def destroy_cluster(self, name): + """Destroy a created cluster.""" + logging.info("Destroying cluster {0}".format(name)) + if name in self.__created_clusters: + cluster = self.__created_clusters[name] + + # destroy the cluster + result = run_command(["pcluster", "delete", "--config", cluster.config_file, name]) + if "DELETE_FAILED" in result.stdout: + error = "Cluster deletion failed for {0} with output: {1}".format(name, result.stdout) + logging.error(error) + raise Exception(error) + del self.__created_clusters[name] + logging.info("Cluster {0} deleted successfully".format(name)) + else: + logging.warning("Couldn't find cluster with name {0}. Skipping deletion.".format(name)) + + def destroy_all_clusters(self): + """Destroy all created clusters.""" + logging.debug("Destroying all clusters") + for key in list(self.__created_clusters.keys()): + try: + self.destroy_cluster(key) + except Exception as e: + logging.error("Failed when destroying cluster {0} with exception {1}.".format(key, e)) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py new file mode 100644 index 0000000000..6a15fbb047 --- /dev/null +++ b/tests/integration-tests/conftest.py @@ -0,0 +1,324 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +# This file has a special meaning for pytest. See https://docs.pytest.org/en/2.7.3/plugins.html for +# additional details. 
+ +import json +import logging +import os +import re +from shutil import copyfile + +import configparser +import pytest + +from cfn_stacks_factory import CfnStack, CfnStacksFactory +from clusters_factory import Cluster, ClustersFactory +from conftest_markers import ( + DIMENSIONS_MARKER_ARGS, + add_default_markers, + check_marker_dimensions, + check_marker_list, + check_marker_skip_dimensions, + check_marker_skip_list, +) +from jinja2 import Environment, FileSystemLoader +from utils import create_s3_bucket, delete_s3_bucket, random_alphanumeric, to_snake_case +from vpc_builder import Gateways, SubnetConfig, VPCConfig, VPCTemplateBuilder + + +def pytest_addoption(parser): + """Register argparse-style options and ini-style config values, called once at the beginning of a test run.""" + parser.addoption("--regions", help="aws region where tests are executed", default=["us-east-1"], nargs="+") + parser.addoption("--instances", help="aws instances under test", default=["c5.xlarge"], nargs="+") + parser.addoption("--oss", help="OSs under test", default=["alinux"], nargs="+") + parser.addoption("--schedulers", help="schedulers under test", default=["slurm"], nargs="+") + parser.addoption("--tests-log-file", help="file used to write test logs", default="pytest.log") + parser.addoption("--output-dir", help="output dir for tests artifacts") + # Can't mark fields as required due to: https://github.com/pytest-dev/pytest/issues/2026 + parser.addoption("--key-name", help="key to use for EC2 instances", type=str) + parser.addoption("--key-path", help="key path to use for SSH connections", type=str) + parser.addoption("--custom-chef-cookbook", help="url to a custom cookbook package") + parser.addoption("--custom-awsbatch-template-url", help="url to a custom awsbatch template") + parser.addoption("--template-url", help="url to a custom cfn template") + parser.addoption("--custom-awsbatchcli-package", help="url to a custom awsbatch cli package") + parser.addoption("--custom-node-package", help="url to a custom node package") + + +def pytest_generate_tests(metafunc): + """Generate (multiple) parametrized calls to a test function.""" + _parametrize_from_option(metafunc, "instance", "instances") + _parametrize_from_option(metafunc, "region", "regions") + _parametrize_from_option(metafunc, "os", "oss") + _parametrize_from_option(metafunc, "scheduler", "schedulers") + + +def pytest_configure(config): + """This hook is called for every plugin and initial conftest file after command line options have been parsed.""" + # register additional markers + config.addinivalue_line("markers", "instances(instances_list): run test only against the listed instances.") + config.addinivalue_line("markers", "regions(regions_list): run test only against the listed regions") + config.addinivalue_line("markers", "oss(os_list): run test only against the listed oss") + config.addinivalue_line("markers", "schedulers(schedulers_list): run test only against the listed schedulers") + config.addinivalue_line( + "markers", "dimensions(region, instance, os, scheduler): run test only against the listed dimensions" + ) + config.addinivalue_line("markers", "skip_instances(instances_list): skip test for the listed instances") + config.addinivalue_line("markers", "skip_regions(regions_list): skip test for the listed regions") + config.addinivalue_line("markers", "skip_oss(os_list): skip test for the listed oss") + config.addinivalue_line("markers", "skip_schedulers(schedulers_list): skip test for the listed schedulers") + 
config.addinivalue_line( + "markers", "skip_dimensions(region, instance, os, scheduler): skip test for the listed dimensions" + ) + + _setup_custom_logger(config.getoption("tests_log_file")) + + +def pytest_runtest_call(item): + """Called to execute the test item.""" + _add_properties_to_report(item) + add_default_markers(item) + + check_marker_list(item, "instances", "instance") + check_marker_list(item, "regions", "region") + check_marker_list(item, "oss", "os") + check_marker_list(item, "schedulers", "scheduler") + check_marker_skip_list(item, "skip_instances", "instance") + check_marker_skip_list(item, "skip_regions", "region") + check_marker_skip_list(item, "skip_oss", "os") + check_marker_skip_list(item, "skip_schedulers", "scheduler") + check_marker_dimensions(item) + check_marker_skip_dimensions(item) + + logging.info("Running test " + item.name) + + +def pytest_collection_modifyitems(items): + """Called after collection has been performed, may filter or re-order the items in-place.""" + _add_filename_markers(items) + + +def pytest_exception_interact(node, call, report): + """Called when an exception was raised which can potentially be interactively handled..""" + logging.error("Exception raised while executing {0}: {1}".format(node.name, call.excinfo)) + + +def _add_filename_markers(items): + """Add a marker based on the name of the file where the test case is defined.""" + for item in items: + test_location = os.path.splitext(os.path.basename(item.location[0]))[0] + marker = re.sub(r"test_|_test", "", test_location) + item.add_marker(marker) + + +def _parametrize_from_option(metafunc, test_arg_name, option_name): + if test_arg_name in metafunc.fixturenames: + metafunc.parametrize(test_arg_name, metafunc.config.getoption(option_name), scope="class") + + +def _setup_custom_logger(log_file): + formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(module)s - %(message)s") + logger = logging.getLogger() + logger.handlers = [] + + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + logger.setLevel(logging.INFO) + logger.addHandler(console_handler) + + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + +def _add_properties_to_report(item): + for dimension in DIMENSIONS_MARKER_ARGS: + value = item.funcargs.get(dimension) + if value: + item.user_properties.append((dimension, value)) + + +@pytest.fixture(scope="class") +def clusters_factory(request): + """ + Define a fixture to manage the creation and destruction of clusters. 
+ + The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config + """ + factory = ClustersFactory() + + def _cluster_factory(cluster_config): + cluster_config = _write_cluster_config_to_outdir(request, cluster_config) + cluster = Cluster( + name="integ-tests-" + random_alphanumeric(), + config_file=cluster_config, + ssh_key=request.config.getoption("key_path"), + ) + factory.create_cluster(cluster) + return cluster + + yield _cluster_factory + factory.destroy_all_clusters() + + +def _write_cluster_config_to_outdir(request, cluster_config): + out_dir = request.config.getoption("output_dir") + os.makedirs("{0}/clusters_configs".format(out_dir), exist_ok=True) + cluster_config_dst = "{out_dir}/clusters_configs/{test_name}.config".format( + out_dir=out_dir, test_name=request.node.nodeid + ) + copyfile(cluster_config, cluster_config_dst) + return cluster_config_dst + + +@pytest.fixture() +def test_datadir(request, datadir): + """ + Inject the datadir with resources for the specific test function. + + If the test function is declared in a class then datadir is ClassName/FunctionName + otherwise it is only FunctionName. + """ + function_name = request.function.__name__ + if not request.cls: + return datadir / function_name + + class_name = request.cls.__name__ + return datadir / "{0}/{1}".format(class_name, function_name) + + +@pytest.fixture() +def pcluster_config_reader(test_datadir, vpc_stacks, region, request): + """ + Define a fixture to render pcluster config templates associated to the running test. + + The config for a given test is a pcluster.config.ini file stored in the configs_datadir folder. + The config can be written by using Jinja2 template engine. + The current renderer already replaces placeholders for current keys: + {{ region }}, {{ os }}, {{ instance }}, {{ scheduler}}, {{ key_name }}, + {{ vpc_id }}, {{ public_subnet_id }}, {{ private_subnet_id }} + The current renderer injects options for custom templates and packages in case these + are passed to the cli and not present already in the cluster config. 
+ + :return: a _config_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template + """ + config_file = "pcluster.config.ini" + + def _config_renderer(**kwargs): + config_file_path = test_datadir / config_file + _add_custom_packages_configs(config_file_path, request) + default_values = _get_default_template_values(vpc_stacks, region, request) + file_loader = FileSystemLoader(str(test_datadir)) + env = Environment(loader=file_loader) + rendered_template = env.get_template(config_file).render(**{**kwargs, **default_values}) + config_file_path.write_text(rendered_template) + return config_file_path + + return _config_renderer + + +def _add_custom_packages_configs(cluster_config, request): + config = configparser.ConfigParser() + config.read(cluster_config) + cluster_template = "cluster {0}".format(config.get("global", "cluster_template", fallback="default")) + + for custom_option in ["template_url", "custom_awsbatch_template_url", "custom_chef_cookbook"]: + if request.config.getoption(custom_option) and custom_option not in config[cluster_template]: + config[cluster_template][custom_option] = request.config.getoption(custom_option) + + extra_json = json.loads(config.get(cluster_template, "extra_json", fallback="{}")) + for extra_json_custom_option in ["custom_awsbatchcli_package", "custom_node_package"]: + if request.config.getoption(extra_json_custom_option): + cluster = extra_json.get("cluster", {}) + if extra_json_custom_option not in cluster: + cluster[extra_json_custom_option] = request.config.getoption(extra_json_custom_option) + extra_json["cluster"] = cluster + if extra_json: + config[cluster_template]["extra_json"] = json.dumps(extra_json) + + with cluster_config.open(mode="w") as f: + config.write(f) + + +def _get_default_template_values(vpc_stacks, region, request): + """Build a dictionary of default values to inject in the jinja templated cluster configs.""" + default_values = {dimension: request.node.funcargs.get(dimension) for dimension in DIMENSIONS_MARKER_ARGS} + default_values["key_name"] = request.config.getoption("key_name") + vpc = vpc_stacks[region] + for key, value in vpc.cfn_outputs.items(): + default_values[to_snake_case(key)] = value + return default_values + + +@pytest.fixture(scope="session") +def cfn_stacks_factory(): + """Define a fixture to manage the creation and destruction of CloudFormation stacks.""" + factory = CfnStacksFactory() + yield factory + factory.delete_all_stacks() + + +@pytest.fixture(scope="session", autouse=True) +def vpc_stacks(cfn_stacks_factory, request): + """Create VPC used by integ tests in all configured regions.""" + public_subnet = SubnetConfig( + name="PublicSubnet", + cidr="10.0.0.0/24", + map_public_ip_on_launch=True, + has_nat_gateway=True, + default_gateway=Gateways.INTERNET_GATEWAY, + ) + private_subnet = SubnetConfig( + name="PrivateSubnet", + cidr="10.0.1.0/24", + map_public_ip_on_launch=False, + has_nat_gateway=False, + default_gateway=Gateways.NAT_GATEWAY, + ) + vpc_config = VPCConfig(subnets=[public_subnet, private_subnet]) + template = VPCTemplateBuilder(vpc_config).build() + + regions = request.config.getoption("regions") + vpc_stacks = {} + for region in regions: + stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) + cfn_stacks_factory.create_stack(stack) + vpc_stacks[region] = stack + + return vpc_stacks + + +@pytest.fixture(scope="function") +def s3_bucket_factory(region): + """ + Define a fixture to create S3 buckets. 
+ :param region: region where the test is running + :return: a function to create buckets. + """ + created_buckets = [] + + def _create_bucket(): + bucket_name = "integ-tests-" + random_alphanumeric() + logging.info("Creating S3 bucket {0}".format(bucket_name)) + create_s3_bucket(bucket_name, region) + created_buckets.append((bucket_name, region)) + return bucket_name + + yield _create_bucket + + for bucket in created_buckets: + logging.info("Deleting S3 bucket {0}".format(bucket[0])) + try: + delete_s3_bucket(bucket_name=bucket[0], region=bucket[1]) + except Exception as e: + logging.error("Failed deleting bucket {0} with exception: {1}".format(bucket[0], e)) diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py new file mode 100644 index 0000000000..79a892cc6e --- /dev/null +++ b/tests/integration-tests/conftest_markers.py @@ -0,0 +1,202 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging + +import pytest + +DIMENSIONS_MARKER_ARGS = ["region", "instance", "os", "scheduler"] +UNSUPPORTED_DIMENSIONS = [("eu-north-1", "c4.xlarge", "*", "*"), ("eu-west-3", "c4.xlarge", "*", "*")] + + +class InvalidMarkerError(Exception): + """Error raised with marker is invalid""" + + pass + + +def add_default_markers(item): + """ + Add default markers for dimensions that need to be skipped by default for all tests. + + :param item: pytest Item object markers are applied to. + """ + for dimensions in UNSUPPORTED_DIMENSIONS: + item.add_marker(pytest.mark.skip_dimensions(*dimensions)) + + +def check_marker_list(item, marker_name, arg_name): + """ + Skip all tests that are annotated with marker marker_name and have the arg value corresponding to arg_name + not listed in the list passed as first argument to the marker. + + Example: + @pytest.mark.marker_name(["value1", "value2"]) + def test(arg_name) + + The test is executed only if arg_name is equal to "value1" or "value2". + + :param item: pytest Item object annotated with markers. + :param marker_name: name of the marker to process. + :param arg_name: arg name the marker values should be compared to. + """ + arg_value = item.funcargs.get(arg_name) + allowed_values = [] + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, [marker_name + "_list"], len(marker.args)) + allowed_values.extend(marker.args[0]) + + if not allowed_values or arg_value in allowed_values: + return + skip_message = ( + "Skipping test {test_name} because {arg_name} {arg_value} is not in {marker} allowed values: " + "{allowed_values}".format( + test_name=item.name, + arg_name=arg_name, + arg_value=arg_value, + marker=marker_name, + allowed_values=allowed_values, + ) + ) + logging.info(skip_message) + pytest.skip(skip_message) + + +def check_marker_skip_list(item, marker_name, arg_name): + """ + Skip all tests that are annotated with marker marker_name and have the arg value corresponding to arg_name + listed in the list passed as first argument to the marker. 
+ + Example: + @pytest.mark.marker_name(["value1", "value2"]) + def test(arg_name) + + The test is executed only if arg_name is not equal to "value1" or "value2". + + :param item: pytest Item object annotated with markers. + :param marker_name: name of the marker to process. + :param arg_name: arg name the marker values should be compared to. + """ + arg_value = item.funcargs.get(arg_name) + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, [marker_name + "_skip_list"], len(marker.args)) + skip_values = marker.args[0] + if arg_value in skip_values: + skip_message = ( + "Skipping test {test_name} because {arg_name} {arg_value} is in {marker} allowed values:" + "{skip_values}".format( + test_name=item.name, + arg_name=arg_name, + arg_value=arg_value, + marker=marker_name, + skip_values=skip_values, + ) + ) + logging.info(skip_message) + pytest.skip(skip_message) + + +def check_marker_skip_dimensions(item): + """ + Skip all tests that are annotated with @pytest.mark.skip_dimensions and have the args + (region, instance, os, scheduler) match those specified in the marker. + + "*" can be used to identify all values for a specific argument. + + Example: + @pytest.mark.skip_dimensions("a", "b", "*", "d") + def test(region, instance, os, scheduler) + + The test is executed only if the test args (region, instance, os, scheduler) do not match + ("a", "b", "*", "d") + + :param item: pytest Item object annotated with markers. + """ + marker_name = "skip_dimensions" + args_values = [] + for dimension in DIMENSIONS_MARKER_ARGS: + args_values.append(item.funcargs.get(dimension)) + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) + if len(marker.args) != len(DIMENSIONS_MARKER_ARGS): + logging.error( + "Marker {marker_name} requires the following args: {args}".format( + marker_name=marker_name, args=DIMENSIONS_MARKER_ARGS + ) + ) + raise ValueError + dimensions_match = _compare_dimension_lists(args_values, marker.args) + if dimensions_match: + skip_message = ( + "Skipping test {test_name} because dimensions {args_values} match {marker}: " + "{skip_values}".format( + test_name=item.name, args_values=args_values, marker=marker_name, skip_values=marker.args + ) + ) + logging.info(skip_message) + pytest.skip(skip_message) + + +def check_marker_dimensions(item): + """ + Execute all tests that are annotated with @pytest.mark.dimensions and have the args + (region, instance, os, scheduler) match those specified in the marker. + + "*" can be used to identify all values for a specific argument. + + Example: + @pytest.mark.dimensions("a", "b", "*", "d") + def test(region, instance, os, scheduler) + + The test is executed only if the test args (region, instance, os, scheduler) match ("a", "b", "*", "d") + + :param item: pytest Item object annotated with markers. 
+ """ + marker_name = "dimensions" + test_args_value = [] + for dimension in DIMENSIONS_MARKER_ARGS: + test_args_value.append(item.funcargs.get(dimension)) + allowed_values = [] + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) + allowed_values.append(marker.args) + dimensions_match = _compare_dimension_lists(test_args_value, marker.args) + if dimensions_match: + return + + if allowed_values: + skip_message = ( + "Skipping test {test_name} because dimensions {test_args_value} do not match any marker {marker} values: " + "{allowed_values}".format( + test_name=item.name, test_args_value=test_args_value, marker=marker_name, allowed_values=allowed_values + ) + ) + logging.info(skip_message) + pytest.skip(skip_message) + + +def _validate_marker(marker_name, expected_args, args_count): + if args_count != len(expected_args): + logging.error( + "Marker {marker_name} requires the following args: {args}".format( + marker_name=marker_name, args=expected_args + ) + ) + raise InvalidMarkerError + + +def _compare_dimension_lists(list1, list2): + if len(list1) != len(list2): + return False + for d1, d2 in zip(list1, list2): + if d1 != "*" and d2 != "*" and d1 != d2: + return False + return True diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py new file mode 100644 index 0000000000..d8b014ebf4 --- /dev/null +++ b/tests/integration-tests/remote_command_executor.py @@ -0,0 +1,111 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging +import os +from typing import NamedTuple + +from paramiko import AutoAddPolicy, SSHClient + + +class RemoteCommandResult(NamedTuple): + """Wrap the results from a remote command execution.""" + + return_code: int = 0 + stdout: str = "" + stderr: str = "" + + +class RemoteCommandExecutionError(Exception): + """Signal a failure in remote command execution.""" + + pass + + +class RemoteCommandExecutor: + """Execute remote commands on the cluster master node.""" + + USERNAMES = { + "alinux": "ec2-user", + "centos6": "centos", + "centos7": "centos", + "ubuntu1404": "ubuntu", + "ubuntu1604": "ubuntu", + } + + def __init__(self, cluster): + self.__ssh_client = SSHClient() + self.__ssh_client.load_system_host_keys() + self.__ssh_client.set_missing_host_key_policy(AutoAddPolicy()) + self.__ssh_client.connect( + hostname=cluster.master_ip, username=self.USERNAMES[cluster.os], key_filename=cluster.ssh_key + ) + self.__sftp_client = self.__ssh_client.open_sftp() + self.__user_at_hostname = "{0}@{1}".format(self.USERNAMES[cluster.os], cluster.master_ip) + + def __del__(self): + try: + self.__ssh_client.close() + except Exception as e: + # Catch all exceptions if we fail to close the clients + logging.warning("Exception raised when closing remote clients: {0}".format(e)) + + def run_remote_command(self, command, log_error=True, additional_files=None, raise_on_error=True): + """ + Execute remote command on the cluster master node. + + :param command: command to execute. + :param log_error: log errors. + :param additional_files: additional files to copy before executing script. + :return: result of the execution. + """ + if isinstance(command, list): + command = " ".join(command) + self._copy_additional_files(additional_files) + logging.info("Executing remote command command on {0}: {1}".format(self.__user_at_hostname, command)) + stdin, stdout, stderr = self.__ssh_client.exec_command(command, get_pty=True) + result = RemoteCommandResult( + return_code=stdout.channel.recv_exit_status(), + stdout="\n".join(stdout.read().decode().splitlines()), + stderr="\n".join(stderr.read().decode().splitlines()), + ) + if result.return_code != 0 and raise_on_error: + if log_error: + logging.error( + "Command {0} failed with error:\n{1}\nand output:\n{2}".format( + command, result.stderr, result.stdout + ) + ) + raise RemoteCommandExecutionError + return result + + def run_remote_script(self, script_file, args=None, log_error=True, additional_files=None): + """ + Execute a script remotely on the cluster master node. + + Script is copied to the master home dir before being executed. + :param script_file: local path to the script to execute remotely. + :param args: args to pass to the script when invoked. + :param log_error: log errors. + :param additional_files: additional files to copy before executing script. + :return: result of the execution. + """ + script_name = os.path.basename(script_file) + self.__sftp_client.put(script_file, script_name) + if not args: + args = [] + return self.run_remote_command( + ["/bin/bash", "--login", script_name] + args, log_error=log_error, additional_files=additional_files + ) + + def _copy_additional_files(self, files): + for file in files or []: + self.__sftp_client.put(file, os.path.basename(file)) diff --git a/tests/integration-tests/reports_generator.py b/tests/integration-tests/reports_generator.py new file mode 100644 index 0000000000..89eb6d1bc7 --- /dev/null +++ b/tests/integration-tests/reports_generator.py @@ -0,0 +1,80 @@ +# Copyright 2019 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import json +import os +from xml.etree import ElementTree + +from junitparser import JUnitXml + + +def generate_junitxml_merged_report(test_results_dir): + """ + Merge all junitxml generated reports in a single one. + :param test_results_dir: output dir containing the junitxml reports to merge. + """ + merged_xml = None + for dir, _, files in os.walk(test_results_dir): + for file in files: + if file.endswith("results.xml"): + if not merged_xml: + merged_xml = JUnitXml.fromfile(os.path.join(dir, file)) + else: + merged_xml += JUnitXml.fromfile(os.path.join(dir, file)) + + merged_xml.write("{0}/test_report.xml".format(test_results_dir), pretty=True) + + +def generate_json_report(test_results_dir): + """ + Generate a json report containing a summary of the tests results with details + for each dimension. + :param test_results_dir: dir containing the tests outputs. + :return: a dictionary containing the computed report. + """ + test_report_file = os.path.join(test_results_dir, "test_report.xml") + if not os.path.isfile(test_report_file): + generate_junitxml_merged_report(test_results_dir) + + root = ElementTree.parse(test_report_file).getroot() + results = { + "all": { + "total": int(root.get("tests")), + "skipped": int(root.get("skipped")), + "failures": int(root.get("failures")), + "errors": int(root.get("errors")), + } + } + _record_results(results, root, "./testcase[skipped]/properties/property", "skipped") + _record_results(results, root, "./testcase[failure]/properties/property", "failures") + _record_results(results, root, "./testcase[error]/properties/property", "errors") + _record_results(results, root, "./testcase/properties/property", "total") + + with open("{0}/test_report.json".format(test_results_dir), "w") as out_f: + out_f.write(json.dumps(results, indent=4)) + + return results + + +def _record_results(results_dict, results_xml_root, xpath_exp, label): + for skipped in results_xml_root.findall(xpath_exp): + if not skipped.get("name") in results_dict: + results_dict[skipped.get("name")] = {} + if not skipped.get("value") in results_dict[skipped.get("name")]: + results_dict[skipped.get("name")].update({skipped.get("value"): _empty_results_dict()}) + results_dict[skipped.get("name")][skipped.get("value")][label] += 1 + + +def _empty_results_dict(): + return {"total": 0, "skipped": 0, "failures": 0, "errors": 0} + + +# generate_tabular_report("1549489575.329696.out", None, None, None, None) diff --git a/tests/integration-tests/requirements.txt b/tests/integration-tests/requirements.txt new file mode 100644 index 0000000000..3a7245e5a4 --- /dev/null +++ b/tests/integration-tests/requirements.txt @@ -0,0 +1,14 @@ +argparse +assertpy +aws-parallelcluster +boto3 +jinja2 +junitparser +paramiko +pytest +pytest-datadir +pytest-html +pytest-rerunfailures +pytest-xdist +retrying +troposphere diff --git a/tests/integration-tests/test_runner.py b/tests/integration-tests/test_runner.py new file mode 100644 index 0000000000..80bedf8fa8 --- /dev/null +++ 
b/tests/integration-tests/test_runner.py @@ -0,0 +1,301 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import datetime +import logging +import multiprocessing +import os +import sys +import time + +import argparse +import pytest + +from reports_generator import generate_json_report, generate_junitxml_merged_report + +logger = logging.getLogger() +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(module)s - %(message)s", level=logging.INFO) + +START_TIME = time.time() +START_TIME_ISO = datetime.datetime.fromtimestamp(START_TIME).isoformat() + +LOGS_DIR = "{0}.logs".format(START_TIME) +OUT_DIR = "{0}.out".format(START_TIME) + +TEST_DEFAULTS = { + "parallelism": None, + "retry_on_failures": False, + "features": "", # empty string means all + "regions": [ + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", + "ca-central-1", + "eu-west-1", + "eu-west-2", + "eu-central-1", + "ap-southeast-1", + "ap-southeast-2", + "ap-northeast-1", + "ap-south-1", + "sa-east-1", + "eu-west-3", + ], + "oss": ["alinux", "centos6", "centos7", "ubuntu1404", "ubuntu1604"], + "schedulers": ["sge", "slurm", "torque", "awsbatch"], + "instances": ["c4.xlarge", "c5.xlarge"], + "dry_run": False, + "reports": [], + "sequential": False, + "output_dir": "tests_outputs", + "custom_node_url": None, + "custom_cookbook_url": None, + "custom_template_url": None, + "custom_awsbatch_template_url": None, + "custom_awsbatchcli_url": None, +} + + +def _init_argparser(): + parser = argparse.ArgumentParser( + description="Run integration tests suite.", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "-f", + "--features", + help="Run only tests for the listed features. Prepending the not keyword to the feature name causes the " + "feature to be excluded.", + default=TEST_DEFAULTS.get("features"), + nargs="+", + ) + parser.add_argument( + "-r", "--regions", help="AWS region where tests are executed.", default=TEST_DEFAULTS.get("regions"), nargs="+" + ) + parser.add_argument( + "-i", "--instances", help="AWS instances under test.", default=TEST_DEFAULTS.get("instances"), nargs="+" + ) + parser.add_argument("-o", "--oss", help="OSs under test.", default=TEST_DEFAULTS.get("oss"), nargs="+") + parser.add_argument( + "-s", "--schedulers", help="Schedulers under test.", default=TEST_DEFAULTS.get("schedulers"), nargs="+" + ) + parser.add_argument( + "-n", "--parallelism", help="Tests parallelism for every region.", default=TEST_DEFAULTS.get("parallelism") + ) + parser.add_argument( + "--retry-on-failures", + help="Retry once more the failed tests after a delay of 60 seconds.", + action="store_true", + default=TEST_DEFAULTS.get("retry_on_failures"), + ) + parser.add_argument( + "--dry-run", + help="Only show the list of tests that would run with specified options.", + action="store_true", + default=TEST_DEFAULTS.get("dry_run"), + ) + parser.add_argument( + "--show-output", + help="Do not redirect tests stdout to file. 
Not recommended when running in multiple regions.", + action="store_true", + default=TEST_DEFAULTS.get("show_output"), + ) + parser.add_argument( + "--sequential", + help="Run tests in a single process. When not specified tests will run concurrently in all regions.", + action="store_true", + default=TEST_DEFAULTS.get("sequential"), + ) + parser.add_argument( + "--reports", + help="create tests report files. junitxml creates a junit-xml style report file. html creates an html " + "style report file. json creates a summary with details for each dimensions", + nargs="+", + choices=["html", "junitxml", "json"], + default=TEST_DEFAULTS.get("reports"), + ) + parser.add_argument("--key-name", help="Key to use for EC2 instances", required=True) + parser.add_argument("--key-path", help="Path to the key to use for SSH connections", required=True, type=_is_file) + parser.add_argument( + "--output-dir", help="Directory where tests outputs are generated", default=TEST_DEFAULTS.get("output_dir") + ) + parser.add_argument( + "--custom-node-url", help="URL to a custom node package.", default=TEST_DEFAULTS.get("custom_node_url") + ) + parser.add_argument( + "--custom-cookbook-url", + help="URL to a custom cookbook package.", + default=TEST_DEFAULTS.get("custom_cookbook_url"), + ) + parser.add_argument( + "--custom-template-url", help="URL to a custom cfn template.", default=TEST_DEFAULTS.get("custom_template_url") + ) + parser.add_argument( + "--custom-awsbatch-template-url", + help="URL to a custom awsbatch cfn template.", + default=TEST_DEFAULTS.get("custom_awsbatch_template_url"), + ) + parser.add_argument( + "--custom-awsbatchcli-url", + help="URL to a custom awsbatch cli package.", + default=TEST_DEFAULTS.get("custom_awsbatchcli_url"), + ) + + return parser + + +def _is_file(value): + if not os.path.isfile(value): + raise argparse.ArgumentTypeError("'{0}' is not a valid key".format(value)) + return value + + +def _get_pytest_args(args, regions, log_file, out_dir): + pytest_args = ["-s", "-vv", "-l", "--rootdir=./tests"] + # Show all tests durations + pytest_args.append("--durations=0") + # Run only tests with the given markers + pytest_args.append("-m") + pytest_args.append(" or ".join(args.features)) + pytest_args.append("--regions") + pytest_args.extend(regions) + pytest_args.append("--instances") + pytest_args.extend(args.instances) + pytest_args.append("--oss") + pytest_args.extend(args.oss) + pytest_args.append("--schedulers") + pytest_args.extend(args.schedulers) + pytest_args.extend(["--tests-log-file", "{0}/{1}".format(args.output_dir, log_file)]) + pytest_args.extend(["--output-dir", "{0}/{1}".format(args.output_dir, out_dir)]) + pytest_args.extend(["--key-name", args.key_name]) + pytest_args.extend(["--key-path", args.key_path]) + + if args.retry_on_failures: + # Rerun tests on failures for one more time after 60 seconds delay + pytest_args.extend(["--reruns", "1", "--reruns-delay", "60"]) + + if args.parallelism: + pytest_args.extend(["-n", args.parallelism]) + + if args.dry_run: + pytest_args.append("--collect-only") + + if "junitxml" in args.reports or "json" in args.reports: + pytest_args.append("--junit-xml={0}/{1}/results.xml".format(args.output_dir, out_dir)) + + if "html" in args.reports: + pytest_args.append("--html={0}/{1}/results.html".format(args.output_dir, out_dir)) + + _set_custom_packages_args(args, pytest_args) + + return pytest_args + + +def _set_custom_packages_args(args, pytest_args): + if args.custom_node_url: + pytest_args.extend(["--custom-node-package", 
args.custom_node_url]) + + if args.custom_cookbook_url: + pytest_args.extend(["--custom-chef-cookbook", args.custom_cookbook_url]) + + if args.custom_template_url: + pytest_args.extend(["--template-url", args.custom_template_url]) + + if args.custom_awsbatch_template_url: + pytest_args.extend(["--custom-awsbatch-template-url", args.custom_awsbatch_template_url]) + + if args.custom_awsbatchcli_url: + pytest_args.extend(["--custom-awsbatchcli-package", args.custom_awsbatchcli_url]) + + +def _get_pytest_regionalized_args(region, args): + return _get_pytest_args( + args=args, + regions=[region], + log_file="{0}/{1}.log".format(LOGS_DIR, region), + out_dir="{0}/{1}".format(OUT_DIR, region), + ) + + +def _get_pytest_non_regionalized_args(args): + return _get_pytest_args( + args=args, regions=args.regions, log_file="{0}/all_regions.log".format(LOGS_DIR), out_dir=OUT_DIR + ) + + +def _run_test_in_region(region, args): + out_dir = "{base_dir}/{out_dir}/{region}".format(base_dir=args.output_dir, out_dir=OUT_DIR, region=region) + os.makedirs(out_dir, exist_ok=True) + + # Redirect stdout to file + if not args.show_output: + sys.stdout = open("{0}/pytest.out".format(out_dir), "w") + + pytest_args_regionalized = _get_pytest_regionalized_args(region, args) + logger.info("Starting tests in region {0} with params {1}".format(region, pytest_args_regionalized)) + pytest.main(pytest_args_regionalized) + + +def _make_logging_dirs(base_dir): + logs_dir = "{base_dir}/{logs_dir}".format(base_dir=base_dir, logs_dir=LOGS_DIR) + os.makedirs(logs_dir, exist_ok=True) + logger.info("Configured logs dir: {0}".format(logs_dir)) + out_dir = "{base_dir}/{out_dir}".format(base_dir=base_dir, out_dir=OUT_DIR) + os.makedirs(out_dir, exist_ok=True) + logger.info("Configured tests output dir: {0}".format(out_dir)) + + +def _run_parallel(args): + jobs = [] + for region in args.regions: + p = multiprocessing.Process(target=_run_test_in_region, args=[region, args]) + jobs.append(p) + p.start() + + for job in jobs: + job.join() + + +def _run_sequential(args): + # Redirect stdout to file + if not args.show_output: + sys.stdout = open("{0}/{1}/pytest.out".format(args.output_dir, OUT_DIR), "w") + + pytest_args_non_regionalized = _get_pytest_non_regionalized_args(args) + logger.info("Starting tests with params {0}".format(pytest_args_non_regionalized)) + pytest.main(pytest_args_non_regionalized) + + +def main(): + """Entrypoint for tests executor.""" + args = _init_argparser().parse_args() + logger.info("Starting tests with parameters {0}".format(args)) + + _make_logging_dirs(args.output_dir) + + if args.sequential: + _run_sequential(args) + else: + _run_parallel(args) + + logger.info("All tests completed!") + + reports_output_dir = "{base_dir}/{out_dir}".format(base_dir=args.output_dir, out_dir=OUT_DIR) + if "junitxml" in args.reports: + generate_junitxml_merged_report(reports_output_dir) + + if "json" in args.reports: + logger.info("Generating tests report") + generate_json_report(reports_output_dir) + + +if __name__ == "__main__": + main() diff --git a/tests/integration-tests/tests/__init__.py b/tests/integration-tests/tests/__init__.py new file mode 100644 index 0000000000..221b7a2eca --- /dev/null +++ b/tests/integration-tests/tests/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. 
A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/integration-tests/tests/test_awsbatch.py b/tests/integration-tests/tests/test_awsbatch.py new file mode 100644 index 0000000000..a66ffbaab6 --- /dev/null +++ b/tests/integration-tests/tests/test_awsbatch.py @@ -0,0 +1,120 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging +import re + +import pytest +from retrying import retry + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from time_utils import minutes, seconds + + +@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.instances(["c5.xlarge", "t2.large"]) +@pytest.mark.dimensions("*", "*", "alinux", "awsbatch") +@pytest.mark.usefixtures("region", "os", "instance", "scheduler") +def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir): + """ + Test all AWS Batch related features. + + Grouped all tests in a single function so that cluster can be reused for all of them. 
+ """ + cluster_config = pcluster_config_reader() + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + _test_simple_job_submission(remote_command_executor, test_datadir) + _test_array_submission(remote_command_executor) + _test_mnp_submission(remote_command_executor, test_datadir) + _test_job_kill(remote_command_executor) + + +def _test_simple_job_submission(remote_command_executor, test_datadir): + logging.info("Testing inline submission.") + _test_job_submission(remote_command_executor, "awsbsub --vcpus 2 --memory 256 --timeout 60 sleep 1") + + # FIXME: uncomment once this bug is fixed + # logging.info("Testing inline submission with env.") + # _test_job_submission( + # remote_command_executor, + # 'export TEST=test && awsbsub --vcpus 2 --memory 256 --timeout 60 -e TEST "env | grep TEST=test"', + # ) + + logging.info("Testing stdin submission with env") + _test_job_submission( + remote_command_executor, + 'export TEST=test && echo "env | grep TEST=test" | awsbsub --vcpus 2 --memory 256 --timeout 60 -e TEST', + ) + + logging.info("Testing command file with env") + _test_job_submission( + remote_command_executor, + "export TEST=test && awsbsub --vcpus 2 --memory 256 --timeout 60 -e TEST -cf test_simple_job.sh", + [str(test_datadir / "test_simple_job.sh")], + ) + + +def _test_array_submission(remote_command_executor): + logging.info("Testing array submission.") + _test_job_submission(remote_command_executor, "awsbsub --vcpus 1 --memory 128 -a 4 sleep 1", children_number=4) + + +def _test_mnp_submission(remote_command_executor, test_datadir): + logging.info("Testing MNP submission with MPI job.") + _test_job_submission( + remote_command_executor, + "awsbsub --vcpus 1 --memory 128 -n 4 -cf test_mpi_job.sh", + additional_files=[str(test_datadir / "test_mpi_job.sh")], + children_number=4, + ) + + +def _test_job_kill(remote_command_executor): + logging.info("Testing job kill.") + result = remote_command_executor.run_remote_command("awsbsub --vcpus 2 --memory 256 --timeout 60 sleep 300") + job_id = _assert_job_submitted(result.stdout) + + remote_command_executor.run_remote_command("awsbkill {0}".format(job_id)) + status = _wait_job_completed(remote_command_executor, job_id) + + assert_that(status).contains_only("FAILED") + result = remote_command_executor.run_remote_command("awsbstat -d {0}".format(job_id)) + assert_that(result.stdout).matches(r"statusReason\s+: Terminated by the user") + + +def _test_job_submission(remote_command_executor, submit_command, additional_files=None, children_number=0): + logging.debug("Submitting Batch job") + result = remote_command_executor.run_remote_command(submit_command, additional_files=additional_files) + job_id = _assert_job_submitted(result.stdout) + logging.debug("Submitted Batch job id: {0}".format(job_id)) + status = _wait_job_completed(remote_command_executor, job_id) + assert_that(status).is_length(1 + children_number) + assert_that(status).contains_only("SUCCEEDED") + + +def _assert_job_submitted(awsbsub_output): + __tracebackhide__ = True + match = re.match(r"Job ([a-z0-9\-]{36}) \(.+\) has been submitted.", awsbsub_output) + assert_that(match).is_not_none() + return match.group(1) + + +@retry( + retry_on_result=lambda result: "FAILED" not in result and any(status != "SUCCEEDED" for status in result), + wait_fixed=seconds(7), + stop_max_delay=minutes(15), +) +def _wait_job_completed(remote_command_executor, job_id): + result = remote_command_executor.run_remote_command("awsbstat -d 
{0}".format(job_id)) + return re.findall(r"status\s+: (.+)", result.stdout) diff --git a/tests/integration-tests/tests/test_awsbatch/test_awsbatch/pcluster.config.ini b/tests/integration-tests/tests/test_awsbatch/test_awsbatch/pcluster.config.ini new file mode 100644 index 0000000000..360de75fd0 --- /dev/null +++ b/tests/integration-tests/tests/test_awsbatch/test_awsbatch/pcluster.config.ini @@ -0,0 +1,20 @@ +[global] +cluster_template = awsbatch + +[aws] +aws_region_name = {{ region }} + +[cluster awsbatch] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = awsbatch +compute_instance_type = {{ instance }} +min_vcpus = 2 +desired_vcpus = 2 +max_vcpus = 40 + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} diff --git a/tests/integration-tests/tests/test_awsbatch/test_awsbatch/test_mpi_job.sh b/tests/integration-tests/tests/test_awsbatch/test_awsbatch/test_mpi_job.sh new file mode 100644 index 0000000000..3b148f2544 --- /dev/null +++ b/tests/integration-tests/tests/test_awsbatch/test_awsbatch/test_mpi_job.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set -e + +echo "ip container: $(/sbin/ip -o -4 addr list eth0 | awk '{print $4}' | cut -d/ -f1)" +echo "ip host: $(curl -s "http://169.254.169.254/latest/meta-data/local-ipv4")" + +# get shared dir +IFS=',' _shared_dirs=(${PCLUSTER_SHARED_DIRS}) +_shared_dir=${_shared_dirs[0]} +_job_dir="${_shared_dir}/${AWS_BATCH_JOB_ID%#*}-${AWS_BATCH_JOB_ATTEMPT}" +_exit_code_file="${_job_dir}/batch-exit-code" + +if [[ "${AWS_BATCH_JOB_NODE_INDEX}" -eq "${AWS_BATCH_JOB_MAIN_NODE_INDEX}" ]]; then + echo "Hello I'm the main node $(hostname)! I run the mpi job!" + + mkdir -p "${_job_dir}" + + echo "Writing mpi code..." + cat > "${_shared_dir}/mpi_hello_world.c" << EOF +// Copyright 2011 www.mpitutorial.com +// +// An intro MPI hello world program that uses MPI_Init, MPI_Comm_size, +// MPI_Comm_rank, MPI_Finalize, and MPI_Get_processor_name. +// +#include +#include +#include + +int main(int argc, char** argv) { + // Initialize the MPI environment. The two arguments to MPI Init are not + // currently used by MPI implementations, but are there in case future + // implementations might need the arguments. + MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + + // Print off a hello world message + printf("Hello world from processor %s, rank %d out of %d processors\n", + processor_name, world_rank, world_size); + + // Finalize the MPI environment. No more MPI calls can be made after this + MPI_Finalize(); +} +EOF + + echo "Compiling..." + /usr/lib64/openmpi/bin/mpicc -o "${_job_dir}/mpi_hello_world" "${_shared_dir}/mpi_hello_world.c" + + echo "Running..." + /usr/lib64/openmpi/bin/mpirun --mca btl_tcp_if_include eth0 --allow-run-as-root --machinefile "${HOME}/hostfile" "${_job_dir}/mpi_hello_world" + + # Write exit status code + echo "0" > "${_exit_code_file}" + # Waiting for compute nodes to terminate + sleep 30 +else + echo "Hello I'm a compute node $(hostname)! I let the main node orchestrate the mpi execution!" 
+ # Since mpi orchestration happens on the main node, we need to make sure the containers representing the compute + # nodes are not terminated. A simple trick is to wait for a file containing the status code to be created. + # All compute nodes are terminated by Batch if the main node exits abruptly. + while [ ! -f "${_exit_code_file}" ]; do + sleep 2 + done + exit $(cat "${_exit_code_file}") +fi diff --git a/tests/integration-tests/tests/test_awsbatch/test_awsbatch/test_simple_job.sh b/tests/integration-tests/tests/test_awsbatch/test_awsbatch/test_simple_job.sh new file mode 100644 index 0000000000..7f770c7997 --- /dev/null +++ b/tests/integration-tests/tests/test_awsbatch/test_awsbatch/test_simple_job.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +echo "Executing Test Script" +env | grep TEST=test diff --git a/tests/integration-tests/tests/test_fsx_lustre.py b/tests/integration-tests/tests/test_fsx_lustre.py new file mode 100644 index 0000000000..87411376ca --- /dev/null +++ b/tests/integration-tests/tests/test_fsx_lustre.py @@ -0,0 +1,116 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging +import re + +import boto3 +import pytest +from retrying import retry + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from time_utils import minutes, seconds + + +@pytest.mark.regions(["us-east-1", "eu-west-1"]) +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.oss(["centos7"]) +@pytest.mark.schedulers(["sge"]) +@pytest.mark.usefixtures("os", "instance", "scheduler") +def test_fsx_lustre(region, pcluster_config_reader, clusters_factory, s3_bucket_factory, test_datadir): + """ + Test all FSx Lustre related features. + + Grouped all tests in a single function so that cluster can be reused for all of them. 
+ """ + mount_dir = "/fsx_mount_dir" + bucket_name = s3_bucket_factory() + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + bucket.upload_file(str(test_datadir / "s3_test_file"), "s3_test_file") + cluster_config = pcluster_config_reader(bucket_name=bucket_name, mount_dir=mount_dir) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + _test_fsx_lustre_correctly_mounted(remote_command_executor, mount_dir) + _test_import_path(remote_command_executor, mount_dir) + _test_fsx_lustre_correctly_shared(remote_command_executor, mount_dir) + _test_export_path(remote_command_executor, mount_dir, bucket_name) + + +def _test_fsx_lustre_correctly_mounted(remote_command_executor, mount_dir): + logging.info("Testing fsx lustre is correctly mounted") + result = remote_command_executor.run_remote_command("df -h -t lustre --output=source,size,target | tail -n +2") + assert_that(result.stdout).matches(r"[0-9\.]+@tcp:/fsx\s+3\.4T\s+{mount_dir}".format(mount_dir=mount_dir)) + + result = remote_command_executor.run_remote_command("cat /etc/fstab") + assert_that(result.stdout).matches( + r"fs-[0-9a-z]+\.fsx\.[a-z1-9\-]+\.amazonaws\.com@tcp:/fsx {mount_dir} lustre defaults,_netdev 0 0".format( + mount_dir=mount_dir + ) + ) + + +def _test_import_path(remote_command_executor, mount_dir): + logging.info("Testing fsx lustre import path") + result = remote_command_executor.run_remote_command("cat {mount_dir}/s3_test_file".format(mount_dir=mount_dir)) + assert_that(result.stdout).is_equal_to("Downloaded by FSx Lustre") + + +def _test_fsx_lustre_correctly_shared(remote_command_executor, mount_dir): + logging.info("Testing fsx lustre correctly mounted on compute nodes") + remote_command_executor.run_remote_command("touch {mount_dir}/test_file".format(mount_dir=mount_dir)) + job_command = ( + "cat {mount_dir}/s3_test_file " + "&& cat {mount_dir}/test_file " + "&& touch {mount_dir}/compute_output".format(mount_dir=mount_dir) + ) + result = remote_command_executor.run_remote_command("echo '{0}' | qsub".format(job_command)) + job_id = _assert_job_submitted(result.stdout) + _wait_job_completed(remote_command_executor, job_id) + status = _get_job_exit_status(remote_command_executor, job_id) + assert_that(status).is_equal_to("0") + remote_command_executor.run_remote_command("cat {mount_dir}/compute_output".format(mount_dir=mount_dir)) + + +def _test_export_path(remote_command_executor, mount_dir, bucket_name): + logging.info("Testing fsx lustre export path") + remote_command_executor.run_remote_command( + "echo 'Exported by FSx Lustre' > {mount_dir}/file_to_export".format(mount_dir=mount_dir) + ) + remote_command_executor.run_remote_command( + "sudo lfs hsm_archive {mount_dir}/file_to_export && sleep 5".format(mount_dir=mount_dir) + ) + remote_command_executor.run_remote_command( + "aws s3 cp s3://{bucket_name}/export_dir/file_to_export ./file_to_export".format(bucket_name=bucket_name) + ) + result = remote_command_executor.run_remote_command("cat ./file_to_export") + assert_that(result.stdout).is_equal_to("Exported by FSx Lustre") + + +def _assert_job_submitted(qsub_output): + __tracebackhide__ = True + match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted", qsub_output) + assert_that(match).is_not_none() + return match.group(1) + + +@retry(retry_on_result=lambda result: result != 0, wait_fixed=seconds(7), stop_max_delay=minutes(5)) +def _wait_job_completed(remote_command_executor, job_id): + result = 
remote_command_executor.run_remote_command("qacct -j {0}".format(job_id), raise_on_error=False) + return result.return_code + + +def _get_job_exit_status(remote_command_executor, job_id): + result = remote_command_executor.run_remote_command("qacct -j {0}".format(job_id)) + match = re.search(r"exit_status\s+([0-9]+)", result.stdout) + assert_that(match).is_not_none() + return match.group(1) diff --git a/tests/integration-tests/tests/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini b/tests/integration-tests/tests/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini new file mode 100644 index 0000000000..a2f4ac68cb --- /dev/null +++ b/tests/integration-tests/tests/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini @@ -0,0 +1,29 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +compute_instance_type = {{ instance }} +initial_queue_size = 1 +maintain_initial_size = true +fsx_settings = fsx +s3_read_resource = arn:aws:s3:::{{ bucket_name }}/* + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} + +[fsx fsx] +shared_dir = {{ mount_dir }} +storage_capacity = 3600 +imported_file_chunk_size = 1024 +import_path = s3://{{ bucket_name }} +export_path = s3://{{ bucket_name }}/export_dir +weekly_maintenance_start_time = 1:00:00 diff --git a/tests/integration-tests/tests/test_fsx_lustre/test_fsx_lustre/s3_test_file b/tests/integration-tests/tests/test_fsx_lustre/test_fsx_lustre/s3_test_file new file mode 100644 index 0000000000..4e9cb59256 --- /dev/null +++ b/tests/integration-tests/tests/test_fsx_lustre/test_fsx_lustre/s3_test_file @@ -0,0 +1 @@ +Downloaded by FSx Lustre diff --git a/tests/integration-tests/time_utils.py b/tests/integration-tests/time_utils.py new file mode 100644 index 0000000000..90cd6e13e2 --- /dev/null +++ b/tests/integration-tests/time_utils.py @@ -0,0 +1,21 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + + +def minutes(min): + """Convert minutes to milliseconds.""" + return min * seconds(60) + + +def seconds(sec): + """Convert seconds to milliseconds""" + return sec * 1000 diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py new file mode 100644 index 0000000000..d310948649 --- /dev/null +++ b/tests/integration-tests/utils.py @@ -0,0 +1,102 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
+# See the License for the specific language governing permissions and limitations under the License. +import logging +import random +import re +import shlex +import string +import subprocess + +import boto3 +from retrying import retry + + +def retry_if_subprocess_error(exception): + """Return True if we should retry (in this case when it's a CalledProcessError), False otherwise""" + return isinstance(exception, subprocess.CalledProcessError) + + +def run_command(command, capture_output=True, log_error=True): + """Execute shell command.""" + if isinstance(command, str): + command = shlex.split(command) + logging.info("Executing command: " + " ".join(command)) + result = subprocess.run(command, capture_output=capture_output, universal_newlines=True, encoding="utf-8") + try: + result.check_returncode() + except subprocess.CalledProcessError: + if log_error: + logging.error( + "Command {0} failed with error:\n{1}\nand output:\n{2}".format( + " ".join(command), result.stderr, result.stdout + ) + ) + raise + + return result + + +def random_alphanumeric(size=16): + """Generate a random alphanumeric string.""" + return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(size)) + + +@retry(wait_exponential_multiplier=500, wait_exponential_max=5000, stop_max_attempt_number=5) +def retrieve_cfn_outputs(stack_name, region): + """Retrieve CloudFormation Stack Outputs from a given stack.""" + logging.debug("Retrieving stack outputs for stack {}".format(stack_name)) + try: + cfn = boto3.client("cloudformation", region_name=region) + stack = cfn.describe_stacks(StackName=stack_name).get("Stacks")[0] + outputs = {} + for output in stack.get("Outputs", []): + outputs[output.get("OutputKey")] = output.get("OutputValue") + return outputs + except Exception as e: + logging.warning("Failed retrieving stack outputs for stack {} with exception: {}".format(stack_name, e)) + raise + + +def to_snake_case(input): + """Convert a string into its snake case representation.""" + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", input) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() + + +def create_s3_bucket(bucket_name, region): + """ + Create a new S3 bucket. + + :param bucket_name: name of the S3 bucket to create + :param region: region where the bucket is created + """ + s3_client = boto3.client("s3", region_name=region) + if region != "us-east-1": + s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": region}) + else: + s3_client.create_bucket(Bucket=bucket_name) + + +@retry(wait_exponential_multiplier=500, wait_exponential_max=5000, stop_max_attempt_number=3) +def delete_s3_bucket(bucket_name, region): + """ + Delete an S3 bucket together with all stored objects. + + :param bucket_name: name of the S3 bucket to delete + :param region: region of the bucket + """ + try: + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + bucket.objects.all().delete() + bucket.delete() + except boto3.client("s3").exceptions.NoSuchBucket: + pass diff --git a/tests/integration-tests/vpc_builder.py b/tests/integration-tests/vpc_builder.py new file mode 100644 index 0000000000..869b69a4ee --- /dev/null +++ b/tests/integration-tests/vpc_builder.py @@ -0,0 +1,174 @@ +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. 
+# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +from enum import Enum, auto +from typing import List, NamedTuple + +from troposphere import GetAtt, Output, Ref, Sub, Tags, Template +from troposphere.ec2 import ( + EIP, + VPC, + InternetGateway, + NatGateway, + Route, + RouteTable, + Subnet, + SubnetRouteTableAssociation, + VPCGatewayAttachment, +) + + +class Gateways(Enum): + """Define gateways to use for default traffic in a subnet.""" + + INTERNET_GATEWAY = auto() + NAT_GATEWAY = auto() + PROXY = auto() + + +class SubnetConfig(NamedTuple): + """Configuration of a VPC Subnet""" + + name: str = "PublicSubnet" + cidr: str = "10.0.0.0/24" + map_public_ip_on_launch: bool = True + has_nat_gateway: bool = True + default_gateway: Gateways = Gateways.INTERNET_GATEWAY + + def tags(self): + """Get the tags for the subnet""" + return Tags(Name=Sub("${AWS::StackName}-" + self.name + "_subnet"), Stack=Ref("AWS::StackId")) + + +class VPCConfig(NamedTuple): + """Configuration of a VPC""" + + name: str = "vpc" + cidr: str = "10.0.0.0/16" + enable_dns_support: bool = True + enable_dns_hostnames: bool = True + has_internet_gateway: bool = True + subnets: List[SubnetConfig] = [SubnetConfig()] + tags: Tags = Tags(Name=Ref("AWS::StackName"), Stack=Ref("AWS::StackId")) + + +class VPCTemplateBuilder: + """Build troposphere CFN templates for VPC creation.""" + + def __init__(self, vpc_config, description="VPC built by VPCBuilder"): + self.__template = Template() + self.__template.set_version("2010-09-09") + self.__template.set_description(description) + self.__vpc_config = vpc_config + + def build(self): + """Build the template.""" + self.__build_template() + return self.__template + + def __build_template(self): + vpc = self.__build_vpc() + internet_gateway = self.__build_internet_gateway(vpc) + nat_gateway = None + subnet_refs = [] + for subnet in self.__vpc_config.subnets: + subnet_ref = self.__build_subnet(subnet, vpc) + subnet_refs.append(subnet_ref) + if subnet.has_nat_gateway: + nat_gateway = self.__build_nat_gateway(subnet, subnet_ref) + + for subnet, subnet_ref in zip(self.__vpc_config.subnets, subnet_refs): + self.__build_route_table(subnet, subnet_ref, vpc, internet_gateway, nat_gateway) + + def __build_vpc(self): + vpc_config = self.__vpc_config + vpc = self.__template.add_resource( + VPC( + vpc_config.name, + CidrBlock=vpc_config.cidr, + EnableDnsSupport=vpc_config.enable_dns_support, + EnableDnsHostnames=vpc_config.enable_dns_hostnames, + Tags=vpc_config.tags, + ) + ) + self.__template.add_output(Output("VpcId", Value=Ref(vpc), Description="VPC Id")) + return vpc + + def __build_internet_gateway(self, vpc: VPC): + internet_gateway = self.__template.add_resource( + InternetGateway("InternetGateway", Tags=Tags(Name=Ref("AWS::StackName"), Stack=Ref("AWS::StackId"))) + ) + self.__template.add_resource( + VPCGatewayAttachment("VPCGatewayAttachment", VpcId=Ref(vpc), InternetGatewayId=Ref(internet_gateway)) + ) + return internet_gateway + + def __build_subnet(self, subnet_config: SubnetConfig, vpc: VPC): + subnet = self.__template.add_resource( + Subnet( + subnet_config.name, + CidrBlock=subnet_config.cidr, + VpcId=Ref(vpc), + MapPublicIpOnLaunch=subnet_config.map_public_ip_on_launch, + 
Tags=subnet_config.tags(), + ) + ) + self.__template.add_output(Output(subnet_config.name + "Id", Value=Ref(subnet))) + return subnet + + def __build_nat_gateway(self, subnet_config: SubnetConfig, subnet_ref: Subnet): + nat_eip = self.__template.add_resource(EIP("NatEIP" + subnet_config.name, Domain="vpc")) + return self.__template.add_resource( + NatGateway( + "NatGateway" + subnet_config.name, + AllocationId=GetAtt(nat_eip, "AllocationId"), + SubnetId=Ref(subnet_ref), + ) + ) + + def __build_route_table( + self, + subnet_config: SubnetConfig, + subnet_ref: Subnet, + vpc: VPC, + internet_gateway: InternetGateway, + nat_gateway: NatGateway, + ): + route_table = self.__template.add_resource( + RouteTable( + "RouteTable" + subnet_config.name, + VpcId=Ref(vpc), + Tags=Tags(Name=Sub("${AWS::StackName}_route_table_" + subnet_config.name), Stack=Ref("AWS::StackId")), + ) + ) + self.__template.add_resource( + SubnetRouteTableAssociation( + "RouteAssociation" + subnet_config.name, SubnetId=Ref(subnet_ref), RouteTableId=Ref(route_table) + ) + ) + if subnet_config.default_gateway == Gateways.INTERNET_GATEWAY: + self.__template.add_resource( + Route( + "DefaultRoute" + subnet_config.name, + RouteTableId=Ref(route_table), + DestinationCidrBlock="0.0.0.0/0", + GatewayId=Ref(internet_gateway), + ) + ) + elif subnet_config.default_gateway == Gateways.NAT_GATEWAY: + self.__template.add_resource( + Route( + "NatRoute" + subnet_config.name, + RouteTableId=Ref(route_table), + DestinationCidrBlock="0.0.0.0/0", + NatGatewayId=Ref(nat_gateway), + ) + ) diff --git a/tests/parallelcluster-release-check.py b/tests/parallelcluster-release-check.py index abcb38adb3..539d4089c3 100644 --- a/tests/parallelcluster-release-check.py +++ b/tests/parallelcluster-release-check.py @@ -26,7 +26,6 @@ # (value does not matter). That subnet will be used as the launch # target for the cluster. 
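[Reviewer note, not part of the diff] VPCTemplateBuilder above produces a troposphere template with one route table per subnet, an Internet gateway, and an optional NAT gateway for private subnets. A minimal usage sketch under stated assumptions (the fixture that actually consumes the builder lives in conftest.py, outside this excerpt; the subnet layout, stack name, and region below are made up for illustration):

    # Illustrative only: build and deploy the test VPC with VPCTemplateBuilder.
    import boto3

    from vpc_builder import Gateways, SubnetConfig, VPCConfig, VPCTemplateBuilder

    public = SubnetConfig(
        name="PublicSubnet",
        cidr="10.0.0.0/24",
        map_public_ip_on_launch=True,
        has_nat_gateway=True,
        default_gateway=Gateways.INTERNET_GATEWAY,
    )
    private = SubnetConfig(
        name="PrivateSubnet",
        cidr="10.0.1.0/24",
        map_public_ip_on_launch=False,
        has_nat_gateway=False,
        default_gateway=Gateways.NAT_GATEWAY,  # routes through the public subnet's NAT gateway
    )
    template = VPCTemplateBuilder(VPCConfig(subnets=[public, private])).build()

    # Deploy the rendered CloudFormation template (stack name and region are examples).
    boto3.client("cloudformation", region_name="us-east-1").create_stack(
        StackName="integ-tests-vpc", TemplateBody=template.to_json()
    )
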
-import argparse import datetime import errno import os @@ -38,9 +37,13 @@ import threading import time from builtins import exit +from collections import namedtuple +import argparse import boto3 + import process_helper as prochelp +from hamcrest import * class ReleaseCheckException(Exception): @@ -50,6 +53,23 @@ class ReleaseCheckException(Exception): # # configuration # +ClusterConfig = namedtuple( + "ClusterConfig", + [ + "config_file", + "stack_name", + "region", + "distro", + "instance_type", + "scheduler", + "username", + "key_path", + "key_name", + "master_node", + "scaledown_idletime", + ], +) + username_map = { "alinux": "ec2-user", "centos6": "centos", @@ -58,6 +78,25 @@ class ReleaseCheckException(Exception): "ubuntu1604": "ubuntu", } +# commands used to retrieve the number of compute nodes in each scheduler +get_compute_nodes_command_map = { + "sge": '/bin/bash --login -c "qhost | grep -o ip- | wc -l"', + "slurm": '/bin/bash --login -c "sinfo --Node --noheader | grep compute | wc -l"', + "torque": '/bin/bash --login -c "echo $(( $(/opt/torque/bin/pbsnodes -l all | wc -l) - 1))"', +} + +# default ssh options +ssh_config_options = [ + "-o {0}".format(option) + for option in [ + "StrictHostKeyChecking=no", + "BatchMode=yes", + "ConnectTimeout=60", + "ServerAliveCountMax=5", + "ServerAliveInterval=30", + ] +] + # # global variables (sigh) # @@ -90,78 +129,346 @@ def _double_writeln(fileo, message): fileo.write(message + "\n") -# Helper method to get the name of the autoscaling group -def check_asg_capacity(stack_name, region, out_f): - asg_conn = boto3.client("autoscaling", region_name=region) - iter = 0 - capacity = -1 - while iter < 24 and capacity != 0: - try: - r = asg_conn.describe_tags(Filters=[{"Name": "value", "Values": [stack_name]}]) - asg_name = r.get("Tags")[0].get("ResourceId") - response = asg_conn.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) - capacity = response["AutoScalingGroups"][0]["DesiredCapacity"] - iter += 1 - time.sleep(10) - except Exception as e: - _double_writeln(out_f, "check_asg_capacity failed with %s exception: %s" % (type(e), e)) - raise +def _get_attached_compute_nodes(cluster_config): + """ + Returns the number of compute nodes attached to the scheduler. + Args: + cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster. + + Returns: + number_of_nodes: number of available compute nodes. + """ + output = _exec_ssh_command( + command=get_compute_nodes_command_map[cluster_config.scheduler], + username=cluster_config.username, + host=cluster_config.master_node, + key_path=cluster_config.key_path, + ) + # get last line of the output containing the number of compute nodes + return int(output.split()[-1]) + + +def _get_desired_asg_capacity(cluster_config): + """ + Retrieves the desired capacity of the autoscaling group for a specific cluster. + Args: + cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster. + + Returns: + asg_capacity: the desired capacity of the autoscaling group. 
+ """ + asg_conn = boto3.client("autoscaling", region_name=cluster_config.region) + tags = asg_conn.describe_tags(Filters=[{"Name": "value", "Values": [cluster_config.stack_name]}]) + asg_name = tags.get("Tags")[0].get("ResourceId") + response = asg_conn.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) + return response["AutoScalingGroups"][0]["DesiredCapacity"] + + +def _exec_ssh_command(command, host, username, key_path, stdout=sub.PIPE, stderr=sub.STDOUT): + """ + Executes an ssh command on a remote host. + Args: + command: command to execute. + host: host where the command is executed. + username: username used to ssh into the host. + key_path: key used to ssh into the host. + stdout: stdout redirection. Defaults to sub.PIPE. + stderr: stderr redirection. Defaults to sub.STDOUT. + + Returns: + the stdout for the executed command. + """ + ssh_params = list(ssh_config_options) + if key_path: + ssh_params.extend(["-i", key_path]) + + return prochelp.exec_command( + ["ssh", "-n"] + ssh_params + ["%s@%s" % (username, host), command], + stdout=stdout, + stderr=stderr, + universal_newlines=True, + ) - _double_writeln(out_f, "ASG Capacity was %s after %s second(s)" % (capacity, 10 * iter)) - if capacity != 0: - raise ReleaseCheckException("Autoscaling group's desired capacity was not zero. Capacity was %s" % capacity) +def _watch_compute_nodes_allocation(duration, frequency, cluster_config): + """ + Periodically watches the number of compute nodes in the cluster. + The function returns after duration or when the compute nodes scaled down to 0. + Args: + duration: duration in seconds of the periodical check. + frequency: polling interval in seconds. + cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster. + + Returns: + (asg_capacity_time_series, compute_nodes_time_series, timestamps): three lists describing + the variation over time in the number of compute nodes and the timestamp when these fluctuations occurred. + asg_capacity_time_series describes the variation in the desired asg capacity. compute_nodes_time_series + describes the variation in the number of compute nodes seen by the scheduler. timestamps describes the + time since epoch when the variations occurred. + """ + asg_capacity_time_series = [] + compute_nodes_time_series = [] + timestamps = [] + + timeout = time.time() + duration + while time.time() < timeout: + compute_nodes = _get_attached_compute_nodes(cluster_config) + asg_capacity = _get_desired_asg_capacity(cluster_config) + timestamp = time.time() + + # add values only if there is a transition. + if ( + len(asg_capacity_time_series) == 0 + or asg_capacity_time_series[-1] != asg_capacity + or compute_nodes_time_series[-1] != compute_nodes + ): + asg_capacity_time_series.append(asg_capacity) + compute_nodes_time_series.append(compute_nodes) + timestamps.append(timestamp) + + # break loop before timeout only when compute nodes are scaled down to 0. + if asg_capacity_time_series[-1] == 0 and compute_nodes_time_series[-1] == 0: + if max(asg_capacity_time_series) > 0 and max(compute_nodes_time_series) > 0: + break + time.sleep(frequency) + + return asg_capacity_time_series, compute_nodes_time_series, timestamps + + +def _execute_test_jobs_on_cluster(cluster_config, log_file): + """ + Executes test jobs defined in cluster-check.sh on a given cluster. + Args: + cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster. + log_file: file where to write logs. 
+ """ + ssh_params = list(ssh_config_options) + if cluster_config.key_path: + ssh_params.extend(["-i", cluster_config.key_path]) + + prochelp.exec_command( + ["scp"] + + ssh_params + + [ + os.path.join(_dirname(), "cluster-check.sh"), + "%s@%s:." % (cluster_config.username, cluster_config.master_node), + ], + stdout=log_file, + stderr=sub.STDOUT, + universal_newlines=True, + ) + _exec_ssh_command( + command="/bin/bash --login cluster-check.sh submit %s" % cluster_config.scheduler, + username=cluster_config.username, + host=cluster_config.master_node, + key_path=cluster_config.key_path, + stdout=log_file, + ) -# -# run a single test, possibly in parallel -# -def run_test(region, distro, scheduler, instance_type, key_name, extra_args): - scaledown_idletime = 2 - testname = "%s-%s-%s-%s-%s" % (region, distro, scheduler, instance_type.replace(".", ""), _timestamp) - test_filename = "%s-config.cfg" % testname - key_path = extra_args["key_path"] + +def _get_master_ip(cluster_config_file, cluster_name, log_file): + """ + Retrieves the ip of the master node for a given cluster. + Args: + cluster_config_file: file containing the config of the cluster. + cluster_name: name of the cluster. + log_file: file where to write logs. + + Returns: + master_ip: the ip of the master node. + """ + master_ip = "" + # get the master ip, which means grepping through pcluster status output + dump = prochelp.exec_command( + ["pcluster", "status", "--config", cluster_config_file, cluster_name], + stderr=sub.STDOUT, + universal_newlines=True, + ) + dump_array = dump.splitlines() + for line in dump_array: + m = re.search("MasterPublicIP: (.+)$", line) + if m: + master_ip = m.group(1) + break + + # Check master ip was correctly retrieved + if master_ip == "": + _double_writeln( + log_file, "!! %s: Master IP not found. This usually occurs when cluster creation failed." % cluster_name + ) + raise ReleaseCheckException("--> %s: Master IP not found!" % cluster_name) + _double_writeln(log_file, "--> %s Master IP: %s" % (cluster_name, master_ip)) + + return master_ip + + +def _write_pcluster_config(cluster_config, extra_args): + """ + Creates a file containing the config needed by pcluster to spin up the cluster. + Args: + cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster. + extra_args: extra arguments passed to the test function. 
+ """ custom_cookbook = extra_args["custom_cookbook_url"] custom_node = extra_args["custom_node_url"] custom_template = extra_args["custom_template_url"] - print("--> %s: Starting" % (testname)) - - file = open(test_filename, "w") - file.write("[aws]\n") - file.write("aws_region_name = %s\n" % region) - file.write("[cluster default]\n") - file.write("vpc_settings = public\n") - file.write("key_name = %s\n" % key_name) - file.write("base_os = %s\n" % distro) - file.write("master_instance_type = %s\n" % instance_type) - file.write("compute_instance_type = %s\n" % instance_type) - file.write("initial_queue_size = 1\n") - file.write("maintain_initial_size = false\n") - file.write("scheduler = %s\n" % (scheduler)) - file.write("scaling_settings = custom\n") - if custom_template: - file.write("template_url = %s\n" % custom_template) - if custom_cookbook: - file.write("custom_chef_cookbook = %s\n" % custom_cookbook) - if custom_node: - file.write('extra_json = { "cluster" : { "custom_node_package" : "%s" } }\n' % custom_node) - file.write("[vpc public]\n") - file.write("master_subnet_id = %s\n" % (setup[region]["subnet"])) - file.write("vpc_id = %s\n" % (setup[region]["vpc"])) - file.write("[global]\n") - file.write("cluster_template = default\n") - file.write("[scaling custom]\n") - file.write("scaledown_idletime = %s\n" % scaledown_idletime) - file.close() + with open(cluster_config.config_file, "w") as file: + file.write("[aws]\n") + file.write("aws_region_name = %s\n" % cluster_config.region) + file.write("[cluster default]\n") + file.write("vpc_settings = public\n") + file.write("key_name = %s\n" % cluster_config.key_name) + file.write("base_os = %s\n" % cluster_config.distro) + file.write("master_instance_type = %s\n" % cluster_config.instance_type) + file.write("compute_instance_type = %s\n" % cluster_config.instance_type) + file.write("initial_queue_size = 1\n") + file.write("maintain_initial_size = false\n") + file.write("scheduler = %s\n" % cluster_config.scheduler) + file.write("scaling_settings = custom\n") + if custom_template: + file.write("template_url = %s\n" % custom_template) + if custom_cookbook: + file.write("custom_chef_cookbook = %s\n" % custom_cookbook) + if custom_node: + file.write('extra_json = { "cluster" : { "custom_node_package" : "%s" } }\n' % custom_node) + file.write("[vpc public]\n") + file.write("master_subnet_id = %s\n" % (setup[cluster_config.region]["subnet"])) + file.write("vpc_id = %s\n" % (setup[cluster_config.region]["vpc"])) + file.write("[global]\n") + file.write("cluster_template = default\n") + file.write("[scaling custom]\n") + file.write("scaledown_idletime = %s\n" % cluster_config.scaledown_idletime) + + +def _assert_scaling_works( + asg_capacity_time_series, compute_nodes_time_series, expected_asg_capacity, expected_compute_nodes +): + """ + Verifies that cluster scaling-up and scaling-down features work correctly. 
+ Args: + asg_capacity_time_series: list describing the fluctuations over time in the asg capacity + compute_nodes_time_series: list describing the fluctuations over time in the compute nodes + expected_asg_capacity: pair containing the expected asg capacity (min_asg_capacity, max_asg_capacity) + expected_compute_nodes: pair containing the expected compute nodes (min_compute_nodes, max_compute_nodes) + """ + assert_that(asg_capacity_time_series, is_not(empty()), "asg_capacity_time_series cannot be empty") + assert_that(compute_nodes_time_series, is_not(empty()), "compute_nodes_time_series cannot be empty") + + expected_asg_capacity_min, expected_asg_capacity_max = expected_asg_capacity + expected_compute_nodes_min, expected_compute_nodes_max = expected_compute_nodes + actual_asg_capacity_max = max(asg_capacity_time_series) + actual_asg_capacity_min = min(asg_capacity_time_series[asg_capacity_time_series.index(actual_asg_capacity_max) :]) + actual_compute_nodes_max = max(compute_nodes_time_series) + actual_compute_nodes_min = min( + compute_nodes_time_series[compute_nodes_time_series.index(actual_compute_nodes_max) :] + ) + assert_that( + actual_asg_capacity_min, + is_(equal_to(expected_asg_capacity_min)), + "actual asg min capacity does not match the expected one", + ) + assert_that( + actual_asg_capacity_max, + is_(equal_to(expected_asg_capacity_max)), + "actual asg max capacity does not match the expected one", + ) + assert_that( + actual_compute_nodes_min, + is_(equal_to(expected_compute_nodes_min)), + "actual number of min compute nodes does not match the expected one", + ) + assert_that( + actual_compute_nodes_max, + is_(equal_to(expected_compute_nodes_max)), + "actual number of max compute nodes does not match the expected one", + ) - out_f = open("%s-out.txt" % testname, "w", 0) - master_ip = "" - username = username_map[distro] +def _assert_test_jobs_completed(cluster_config, max_jobs_exec_time, log_file): + """ + Verifies that test jobs started by cluster-check.sh script were successfully executed + and in a timely manner. + In order to do this the function checks that some files (jobN.done), which denote the fact + that a job has been correctly executed, are present in the shared cluster file-system. + Additionally, the function uses the timestamp contained in those files, that indicates + the end time of each job, to verify that all jobs were executed within the max expected time. + Args: + cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster. + max_jobs_exec_time: max execution time given to the jobs to complete + log_file: file where to write logs. 
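+    Raises:
+        AssertionError: if the job completion markers are missing or the jobs did not finish
+            within max_jobs_exec_time.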
+ + """ + try: + _exec_ssh_command( + command="test -f job1.done -a -f job2.done -a -f job3.done", + username=cluster_config.username, + host=cluster_config.master_node, + key_path=cluster_config.key_path, + stdout=log_file, + ) + output = _exec_ssh_command( + command="cat jobs_start_time", + username=cluster_config.username, + host=cluster_config.master_node, + key_path=cluster_config.key_path, + ) + jobs_start_time = int(output.split()[-1]) + output = _exec_ssh_command( + command="cat job1.done job2.done job3.done | sort -n | tail -1", + username=cluster_config.username, + host=cluster_config.master_node, + key_path=cluster_config.key_path, + ) + jobs_completion_time = int(output.split()[-1]) + jobs_execution_time = jobs_completion_time - jobs_start_time + _double_writeln(log_file, "jobs execution time in seconds: %d" % jobs_execution_time) + assert_that( + jobs_execution_time, + is_(less_than(max_jobs_exec_time)), + "jobs did not complete the execution in the expected time", + ) + except sub.CalledProcessError: + raise AssertionError("Test jobs did not complete in time") + + +# +# run a single test, possibly in parallel +# +def run_test( + region, distro, scheduler, instance_type, key_name, expected_asg_capacity, expected_compute_nodes, extra_args +): _create_interrupted = False _create_done = False + testname = "%s-%s-%s-%s-%s" % (region, distro, scheduler, instance_type.replace(".", ""), _timestamp) + test_filename = "%s-config.cfg" % testname + out_f = open("%s-out.txt" % testname, "w", 0) + # Test jobs should take at most 9 minutes to be executed. + # These guarantees that the jobs are executed in parallel. + max_jobs_exec_time = 9 * 60 + try: + _double_writeln(out_f, "--> %s: Starting" % testname) + + cluster_config = ClusterConfig( + config_file=test_filename, + stack_name="parallelcluster-" + testname, + region=region, + distro=distro, + instance_type=instance_type, + scheduler=scheduler, + username=username_map[distro], + key_path=extra_args["key_path"], + key_name=key_name, + master_node="", + scaledown_idletime=4, + ) + + _write_pcluster_config(cluster_config=cluster_config, extra_args=extra_args) + _double_writeln(out_f, "--> %s: Created pcluster config file %s" % (testname, test_filename)) + # build the cluster + _double_writeln(out_f, "--> %s: Creating the cluster" % testname) prochelp.exec_command( ["pcluster", "create", "--config", test_filename, testname], stdout=out_f, @@ -169,59 +476,49 @@ def run_test(region, distro, scheduler, instance_type, key_name, extra_args): universal_newlines=True, ) _create_done = True - # get the master ip, which means grepping through pcluster status gorp - dump = prochelp.exec_command( - ["pcluster", "status", "--config", test_filename, testname], stderr=sub.STDOUT, universal_newlines=True - ) - dump_array = dump.splitlines() - for line in dump_array: - m = re.search("MasterPublicIP: (.+)$", line) - if m: - master_ip = m.group(1) - break - if master_ip == "": - _double_writeln(out_f, "!! %s: Master IP not found; exiting !!" % (testname)) - raise ReleaseCheckException("--> %s: Master IP not found!" % testname) - _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip)) - - # run test on the cluster... 
- ssh_params = ["-o", "StrictHostKeyChecking=no"] - ssh_params += ["-o", "BatchMode=yes"] - # ssh_params += ['-o', 'ConnectionAttempts=30'] - ssh_params += ["-o", "ConnectTimeout=60"] - ssh_params += ["-o", "ServerAliveCountMax=5"] - ssh_params += ["-o", "ServerAliveInterval=30"] - if key_path: - ssh_params.extend(["-i", key_path]) + _double_writeln(out_f, "--> %s: Cluster created successfully" % testname) - prochelp.exec_command( - ["scp"] + ssh_params + [os.path.join(_dirname(), "cluster-check.sh"), "%s@%s:." % (username, master_ip)], - stdout=out_f, - stderr=sub.STDOUT, - universal_newlines=True, - ) - prochelp.exec_command( - ["ssh", "-n"] - + ssh_params - + ["%s@%s" % (username, master_ip), "/bin/bash --login cluster-check.sh submit %s" % scheduler], - stdout=out_f, - stderr=sub.STDOUT, - universal_newlines=True, + cluster_config = cluster_config._replace( + master_node=_get_master_ip(cluster_config_file=test_filename, cluster_name=testname, log_file=out_f) ) - # Sleep for scaledown_idletime to give time for the instances to scale down - time.sleep(60 * scaledown_idletime) + _double_writeln(out_f, "--> %s: Executing test jobs on cluster." % testname) + _execute_test_jobs_on_cluster(cluster_config=cluster_config, log_file=out_f) + _double_writeln(out_f, "--> %s: Test jobs successfully started" % testname) - check_asg_capacity("parallelcluster-" + testname, region, out_f) + _double_writeln(out_f, "--> %s: Monitoring asg capacity and compute nodes" % testname) + additional_watching_time = 5 * 60 + asg_capacity_time_series, compute_nodes_time_series, timestamps = _watch_compute_nodes_allocation( + duration=max_jobs_exec_time + cluster_config.scaledown_idletime * 60 + additional_watching_time, + frequency=20, + cluster_config=cluster_config, + ) + _double_writeln( + out_f, + "--> %s: Monitoring completed: %s, %s, %s" + % ( + testname, + "asg_capacity_time_series [" + " ".join(map(str, asg_capacity_time_series)) + "]", + "compute_nodes_time_series [" + " ".join(map(str, compute_nodes_time_series)) + "]", + "timestamps [" + " ".join(map(str, timestamps)) + "]", + ), + ) - prochelp.exec_command( - ["ssh", "-n"] - + ssh_params - + ["%s@%s" % (username, master_ip), "/bin/bash --login cluster-check.sh scaledown_check %s" % scheduler], - stdout=out_f, - stderr=sub.STDOUT, - universal_newlines=True, + _double_writeln(out_f, "--> %s: Verifying test jobs completed successfully" % testname) + # jobs need to complete in 9 mins in order to verify parallelism + _assert_test_jobs_completed( + cluster_config=cluster_config, max_jobs_exec_time=max_jobs_exec_time, log_file=out_f + ) + _double_writeln(out_f, "--> %s: Test jobs completed successfully" % testname) + + _double_writeln(out_f, "--> %s: Verifying auto-scaling worked correctly" % testname) + _assert_scaling_works( + asg_capacity_time_series=asg_capacity_time_series, + compute_nodes_time_series=compute_nodes_time_series, + expected_asg_capacity=expected_asg_capacity, + expected_compute_nodes=expected_compute_nodes, ) + _double_writeln(out_f, "--> %s: Autoscaling worked as expected" % testname) _double_writeln(out_f, "SUCCESS: %s!!" % testname) open("%s.success" % testname, "w").close() @@ -232,11 +529,16 @@ def run_test(region, distro, scheduler, instance_type, key_name, extra_args): _double_writeln(out_f, "!! ABORTED: %s!!" % (testname)) open("%s.aborted" % testname, "w").close() raise exc + except AssertionError as err: + _double_writeln(out_f, "--> %s: Test assertion failed: %s" % (testname, err.message)) + _double_writeln(out_f, "!! 
FAILURE: %s!!" % testname) + open("%s.failed" % testname, "w").close() + raise err except Exception as exc: if not _create_done: _create_interrupted = True - _double_writeln(out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc))) - _double_writeln(out_f, "!! FAILURE: %s!!" % (testname)) + _double_writeln(out_f, "--> %s: Unexpected exception %s: %s" % (testname, str(type(exc)), str(exc))) + _double_writeln(out_f, "!! FAILURE: %s!!" % testname) open("%s.failed" % testname, "w").close() raise exc finally: @@ -289,8 +591,8 @@ def run_test(region, distro, scheduler, instance_type, key_name, extra_args): pass except Exception as exc: out_f.write("Unexpected exception launching 'pcluster status' %s: %s\n" % (str(type(exc)), str(exc))) + _double_writeln(out_f, "--> %s: Finished" % testname) out_f.close() - print("--> %s: Finished" % (testname)) # @@ -315,6 +617,8 @@ def test_runner(region, q, key_name, extra_args): scheduler=item["scheduler"], instance_type=item["instance_type"], key_name=key_name, + expected_asg_capacity=item["expected_asg_capacity"], + expected_compute_nodes=item["expected_compute_nodes"], extra_args=extra_args, ) retval = 0 @@ -423,10 +727,14 @@ def _main_child(): "custom_node_url": None, "custom_cookbook_url": None, "custom_template_url": None, + "expected_asg_capacity_min": 0, + "expected_asg_capacity_max": 3, + "expected_compute_nodes_min": 0, + "expected_compute_nodes_max": 3, } parser = argparse.ArgumentParser(description="Test runner for AWS ParallelCluster") - parser.add_argument("--parallelism", help="Number of tests per region to run in parallel", type=int, default=3) + parser.add_argument("--parallelism", help="Number of tests per region to run in parallel", type=int) parser.add_argument("--regions", help="Comma separated list of regions to test", type=str) parser.add_argument("--distros", help="Comma separated list of distributions to test", type=str) parser.add_argument("--schedulers", help="Comma separated list of schedulers to test", type=str) @@ -442,17 +750,31 @@ def _main_child(): "--custom-cookbook-url", help="S3 URL to a custom aws-parallelcluster-cookbook package", type=str ) parser.add_argument( - "--custom-template-url", help="S3 URL to a custom AWS ParallelCluster CloudFormation template", type=str + "--custom-template-url", help="S3 URL to a custom aws-parallelcluster CloudFormation template", type=str + ) + parser.add_argument( + "--expected-asg-capacity-min", help="Expected number of nodes in the asg after scale-down", type=int + ) + parser.add_argument( + "--expected-asg-capacity-max", help="Expected number of nodes in the asg after scale-up", type=int + ) + parser.add_argument( + "--expected-compute-nodes-min", help="Expected number of nodes in the scheduler after scale-down", type=int + ) + parser.add_argument( + "--expected-compute-nodes-max", help="Expected number of nodes in the scheduler after scale-up", type=int ) for key, value in vars(parser.parse_args()).iteritems(): - if not value == None: + if value is not None: config[key] = value region_list = config["regions"].split(",") distro_list = config["distros"].split(",") scheduler_list = config["schedulers"].split(",") instance_type_list = config["instance_types"].split(",") + expected_asg_capacity = (config["expected_asg_capacity_min"], config["expected_asg_capacity_max"]) + expected_compute_nodes = (config["expected_compute_nodes_min"], config["expected_compute_nodes_max"]) print("==> Regions: %s" % (", ".join(region_list))) print("==> Instance Types: %s" % (", 
".join(instance_type_list))) @@ -460,6 +782,8 @@ def _main_child(): print("==> Schedulers: %s" % (", ".join(scheduler_list))) print("==> Parallelism: %d" % (config["parallelism"])) print("==> Key Pair: %s" % (config["key_name"])) + print("==> Expected asg capacity: min=%d, max=%d " % expected_asg_capacity) + print("==> Expected compute nodes: min=%d, max=%d " % expected_compute_nodes) # Optional params if config["key_path"]: @@ -469,7 +793,7 @@ def _main_child(): if config["custom_node_url"]: print("==> Custom aws-parallelcluster-node URL: %s" % (config["custom_node_url"])) if config["custom_template_url"]: - print("==> Custom AWS ParallelCluster template URL: %s" % (config["custom_template_url"])) + print("==> Custom aws-parallelcluster template URL: %s" % (config["custom_template_url"])) # Populate subnet / vpc data for all regions we're going to test. for region in region_list: @@ -497,7 +821,13 @@ def _main_child(): for distro in distro_list: for scheduler in scheduler_list: for instance in instance_type_list: - work_item = {"distro": distro, "scheduler": scheduler, "instance_type": instance} + work_item = { + "distro": distro, + "scheduler": scheduler, + "instance_type": instance, + "expected_asg_capacity": expected_asg_capacity, + "expected_compute_nodes": expected_compute_nodes, + } work_queues[region].put(work_item) # start all the workers diff --git a/tests/raid-test.py b/tests/raid-test.py index 3b9b8f1760..3acdbf9d04 100644 --- a/tests/raid-test.py +++ b/tests/raid-test.py @@ -1,4 +1,3 @@ -import argparse import datetime import os import Queue @@ -12,7 +11,9 @@ import time from builtins import exit +import argparse import boto3 + import process_helper as prochelp UNSUPPORTED_REGIONS = set(["ap-northeast-3", "eu-west-3"]) diff --git a/util/batch-instance-whitelist.py b/util/batch-instance-whitelist.py index 5b64ed264f..e4aec9713e 100755 --- a/util/batch-instance-whitelist.py +++ b/util/batch-instance-whitelist.py @@ -18,11 +18,11 @@ # # usage: ./batch-instance-whitelist.py --regions <'all' or comma seperated list> --bucket -import argparse import json import re import sys +import argparse import boto3 from botocore.exceptions import ClientError diff --git a/util/bump-version.sh b/util/bump-version.sh new file mode 100755 index 0000000000..af4210ae08 --- /dev/null +++ b/util/bump-version.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -ex + +if [ -z "$1" ]; then + echo "New version not specified. 
Usage: bump-version.sh NEW_VERSION" + exit 1 +fi + +NEW_VERSION=$1 +CURRENT_VERSION=$(sed -ne "s/^VERSION = \"\(.*\)\"/\1/p" cli/setup.py) + +sed -i "s/aws-parallelcluster-$CURRENT_VERSION/aws-parallelcluster-$NEW_VERSION/g" cloudformation/aws-parallelcluster.cfn.json +sed -i "s/\"parallelcluster\": \"$CURRENT_VERSION\"/\"parallelcluster\": \"$NEW_VERSION\"/g" cloudformation/aws-parallelcluster.cfn.json +sed -i "s/aws-parallelcluster-cookbook-$CURRENT_VERSION/aws-parallelcluster-cookbook-$NEW_VERSION/g" cloudformation/aws-parallelcluster.cfn.json +sed -i "s/version = '${CURRENT_VERSION%.*}'/version = '${NEW_VERSION%.*}'/g" docs/conf.py +sed -i "s/release = '$CURRENT_VERSION'/release = '$NEW_VERSION'/g" docs/conf.py +sed -i "s/VERSION = \"$CURRENT_VERSION\"/VERSION = \"$NEW_VERSION\"/g" cli/setup.py diff --git a/util/generate-ami-list.py b/util/generate-ami-list.py index c06485eaf1..e444ce2951 100644 --- a/util/generate-ami-list.py +++ b/util/generate-ami-list.py @@ -18,11 +18,11 @@ # # usage: ./generate-ami-list.py --version --date -import argparse import json import sys from collections import OrderedDict +import argparse import boto3 from botocore.exceptions import ClientError diff --git a/util/generate-fsx-substack.py b/util/generate-fsx-substack.py new file mode 100644 index 0000000000..9e7b50f482 --- /dev/null +++ b/util/generate-fsx-substack.py @@ -0,0 +1,87 @@ +import argparse + +from troposphere import And, Condition, Equals, If, Not, NoValue, Output, Parameter, Ref, Select, Template +from troposphere.fsx import FileSystem, LustreConfiguration + + +def main(args): + t = Template() + + # ================= Parameters ================= + # 0 1 2 3 4 5 6 7 + # [shared_dir,fsx_fs_id,storage_capacity,fsx_kms_key_id,imported_file_chunk_size,export_path,import_path,weekly_maintenance_start_time] + fsx_options = t.add_parameter( + Parameter( + "FSXOptions", + Type="CommaDelimitedList", + Description="Comma separated list of fsx related options, 8 parameters in total, [shared_dir,fsx_fs_id,storage_capacity,fsx_kms_key_id,imported_file_chunk_size,export_path,import_path,weekly_maintenance_start_time]", + ) + ) + + compute_security_group = t.add_parameter( + Parameter("ComputeSecurityGroup", Type="String", Description="SecurityGroup for FSx filesystem") + ) + + subnet_id = t.add_parameter(Parameter("SubnetId", Type="String", Description="SubnetId for FSx filesystem")) + + # ================= Conditions ================= + create_fsx = t.add_condition( + "CreateFSX", + And(Not(Equals(Select(str(0), Ref(fsx_options)), "NONE")), Equals(Select(str(1), Ref(fsx_options)), "NONE")), + ) + + use_storage_capacity = t.add_condition("UseStorageCap", Not(Equals(Select(str(2), Ref(fsx_options)), "NONE"))) + use_fsx_kms_key = t.add_condition("UseFSXKMSKey", Not(Equals(Select(str(3), Ref(fsx_options)), "NONE"))) + use_imported_file_chunk_size = t.add_condition( + "UseImportedFileChunkSize", Not(Equals(Select(str(4), Ref(fsx_options)), "NONE")) + ) + use_export_path = t.add_condition("UseExportPath", Not(Equals(Select(str(5), Ref(fsx_options)), "NONE"))) + use_import_path = t.add_condition("UseImportPath", Not(Equals(Select(str(6), Ref(fsx_options)), "NONE"))) + use_weekly_mainenance_start_time = t.add_condition( + "UseWeeklyMaintenanceStartTime", Not(Equals(Select(str(7), Ref(fsx_options)), "NONE")) + ) + + # ================= Resources ================= + fs = t.add_resource( + FileSystem( + "FileSystem", + FileSystemType="LUSTRE", + SubnetIds=[Ref(subnet_id)], + 
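+            # KmsKeyId, StorageCapacity and the LustreConfiguration settings below resolve to
+            # NoValue (the property is omitted) when the corresponding FSXOptions entry is the
+            # string "NONE".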
SecurityGroupIds=[Ref(compute_security_group)], + KmsKeyId=If(use_fsx_kms_key, Select(str(3), Ref(fsx_options)), NoValue), + StorageCapacity=If(use_storage_capacity, Select(str(2), Ref(fsx_options)), NoValue), + LustreConfiguration=LustreConfiguration( + ImportedFileChunkSize=If(use_imported_file_chunk_size, Select(str(4), Ref(fsx_options)), NoValue), + ExportPath=If(use_export_path, Select(str(5), Ref(fsx_options)), NoValue), + ImportPath=If(use_import_path, Select(str(6), Ref(fsx_options)), NoValue), + WeeklyMaintenanceStartTime=If( + use_weekly_mainenance_start_time, Select(str(7), Ref(fsx_options)), NoValue + ), + ), + Condition=create_fsx, + ) + ) + + # ================= Outputs ================= + t.add_output( + Output( + "FileSystemId", + Description="ID of the FileSystem", + Value=If(create_fsx, Ref(fs), Select("1", Ref(fsx_options))), + ) + ) + + # Specify output file path + json_file_path = args.target_path + output_file = open(json_file_path, "w") + output_file.write(t.to_json()) + output_file.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Take in generator related parameters") + parser.add_argument( + "--target-path", type=str, help="The target path for generated substack template", required=True + ) + args = parser.parse_args() + main(args) diff --git a/util/get-ami-list.py b/util/get-ami-list.py index 96652ccab8..0893311fa1 100644 --- a/util/get-ami-list.py +++ b/util/get-ami-list.py @@ -18,12 +18,13 @@ # # usage: ./get-ami-list.py -import argparse import os import re import shutil import tempfile +import argparse + from git import Repo repo_url = "https://github.com/aws/aws-parallelcluster.git" diff --git a/util/upload-cfn-templates.py b/util/upload-cfn-templates.py index 0ed4b412ae..af4ac81640 100644 --- a/util/upload-cfn-templates.py +++ b/util/upload-cfn-templates.py @@ -1,9 +1,8 @@ -import argparse import sys -import pkg_resources - +import argparse import boto3 +import pkg_resources from botocore.exceptions import ClientError diff --git a/util/upload-instance-slot-map.py b/util/upload-instance-slot-map.py index ce865fcd29..8676f04636 100644 --- a/util/upload-instance-slot-map.py +++ b/util/upload-instance-slot-map.py @@ -18,10 +18,10 @@ # # usage: ./upload-instance-slot-map.py --partition [--instance-details ] -import argparse import json import sys +import argparse import boto3 from botocore.exceptions import ClientError diff --git a/util/uploadCLI.sh b/util/uploadCLI.sh new file mode 100644 index 0000000000..7bd5c472bb --- /dev/null +++ b/util/uploadCLI.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +_error_exit() { + echo "$1" + exit 1 +} + +_info() { + echo "INFO: $1" +} + +_help() { + local -- _cmd=$(basename "$0") + + cat < Bucket where upload the package + --srcdir Root folder of the pcluster project + --profile AWS profile name to use for the upload + (optional, default is AWS_PROFILE env variable or "default"). 
+ --region Region to use for AWSCli commands (optional, default is "us-east-1") + -h, --help Print this help message +EOF +} + +main() { + # parse input options + while [ $# -gt 0 ] ; do + case "$1" in + --bucket) _bucket="$2"; shift;; + --bucket=*) _bucket="${1#*=}";; + --srcdir) _srcdir="$2"; shift;; + --srcdir=*) _srcdir="${1#*=}";; + --profile) _profile="$2"; shift;; + --profile=*) _profile="${1#*=}";; + --region) _region="$2"; shift;; + --region=*) _region="${1#*=}";; + -h|--help|help) _help; exit 0;; + *) _help; echo "[error] Unrecognized option '$1'"; exit 1;; + esac + shift + done + + # verify required parameters + if [ -z "${_bucket}" ]; then + _error_exit "--bucket parameter not specified" + _help; + fi + if [ -z "${_srcdir}" ]; then + _error_exit "--srcdir parameter not specified" + _help; + fi + + # initialize optional parameters + if [ -z "${AWS_PROFILE}" ] && [ -z "${_profile}" ]; then + _info "--profile parameter not specified, using 'default'" + elif [ -n "${_profile}" ]; then + _profile="--profile ${_profile}" + fi + if [ -z "${_region}" ]; then + _info "--region parameter not specified, using 'us-east-1'" + _region="us-east-1" + fi + + # check bucket or create it + aws ${_profile} s3api head-bucket --bucket "${_bucket}" --region "${_region}" + if [ $? -ne 0 ]; then + _info "Bucket ${_bucket} do not exist, trying to create it" + aws ${_profile} s3api create-bucket --bucket "${_bucket}" --region "${_region}" + if [ $? -ne 0 ]; then + _error_exit "Unable to create bucket ${_bucket}" + fi + fi + + _version=$(grep "VERSION = \"" "${_srcdir}/cli/setup.py" |awk '{print $3}'| tr -d \") + if [ -z "${_version}" ]; then + _error_exit "Unable to detect AWS ParallelCluster version, are you in the right directory?" + fi + _info "Detected version ${_version}" + + # Create archive + _cwd=$(pwd) + pushd "${_srcdir}" > /dev/null + _stashName=$(git stash create) + git archive --format tar --prefix="aws-parallelcluster-${_version}/" "${_stashName:-HEAD}" | gzip > "${_cwd}/aws-parallelcluster-${_version}.tgz" + popd > /dev/null + + # upload package + aws ${_profile} --region "${_region}" s3 cp --acl public-read aws-parallelcluster-${_version}.tgz s3://${_bucket}/cli/aws-parallelcluster-${_version}.tgz || _error_exit 'Failed to push node to S3' + + _bucket_region=$(aws ${_profile} s3api get-bucket-location --bucket ${_bucket} --output text) + if [ ${_bucket_region} == "None" ]; then + _bucket_region="" + else + _bucket_region=".${_bucket_region}" + fi + + echo "" + echo "Done. Add the following variable to the pcluster config file, under the [cluster ...] 
section" + echo "extra_json = { \"cluster\" : { \"custom_awsbatchcli_package\" : \"https://s3${_bucket_region}.amazonaws.com/${_bucket}/cli/aws-parallelcluster-${_version}.tgz\" } }" +} + +main "$@" + +# vim:syntax=sh diff --git a/util/uploadTemplate.sh b/util/uploadTemplate.sh new file mode 100644 index 0000000000..234a204dbc --- /dev/null +++ b/util/uploadTemplate.sh @@ -0,0 +1,103 @@ +#!/bin/bash +_error_exit() { + echo "$1" + exit 1 +} + +_info() { + echo "INFO: $1" +} + +_help() { + local -- _cmd=$(basename "$0") + + cat < Bucket where upload the template + --srcdir Root folder of the pcluster project + --profile AWS profile name to use for the upload + (optional, default is AWS_PROFILE env variable or "default") + --region Region to use for AWSCli commands (optional, default is "us-east-1") + -h, --help Print this help message +EOF +} + +main() { + # parse input options + while [ $# -gt 0 ] ; do + case "$1" in + --bucket) _bucket="$2"; shift;; + --bucket=*) _bucket="${1#*=}";; + --srcdir) _srcdir="$2"; shift;; + --srcdir=*) _srcdir="${1#*=}";; + --profile) _profile="$2"; shift;; + --profile=*) _profile="${1#*=}";; + --region) _region="$2"; shift;; + --region=*) _region="${1#*=}";; + -h|--help|help) _help; exit 0;; + *) _help; echo "[error] Unrecognized option '$1'"; exit 1;; + esac + shift + done + + # verify required parameters + if [ -z "${_bucket}" ]; then + _error_exit "--bucket parameter not specified" + _help; + fi + if [ -z "${_srcdir}" ]; then + _error_exit "--srcdir parameter not specified" + _help; + fi + + # initialize optional parameters + if [ -z "${AWS_PROFILE}" ] && [ -z "${_profile}" ]; then + _info "--profile parameter not specified, using 'default'" + elif [ -n "${_profile}" ]; then + _profile="--profile ${_profile}" + fi + if [ -z "${_region}" ]; then + _info "--region parameter not specified, using 'us-east-1'" + _region="us-east-1" + fi + + # check bucket or create it + aws ${_profile} s3api head-bucket --bucket "${_bucket}" --region "${_region}" + if [ $? -ne 0 ]; then + _info "Bucket ${_bucket} do not exist, trying to create it" + aws ${_profile} s3api create-bucket --bucket "${_bucket}" --region "${_region}" + if [ $? -ne 0 ]; then + _error_exit "Unable to create bucket ${_bucket}" + fi + fi + + _version=$(grep "VERSION = \"" "${_srcdir}/cli/setup.py" |awk '{print $3}'| tr -d \") + if [ -z "${_version}" ]; then + _error_exit "Unable to detect pcluster version, are you in the right directory?" + fi + _info "Detected version ${_version}" + + # upload templates + aws ${_profile} --region "${_region}" s3 cp --acl public-read ${_srcdir}/cloudformation/aws-parallelcluster.cfn.json s3://${_bucket}/template/aws-parallelcluster.cfn.${_version}.json || _error_exit 'Failed to push cloudformation template to S3' + aws ${_profile} --region "${_region}" s3 cp --acl public-read ${_srcdir}/cloudformation/batch-substack.cfn.json s3://${_bucket}/template/batch-substack.cfn.json || _error_exit 'Failed to push Batch cfn template to S3' + + _bucket_region=$(aws ${_profile} s3api get-bucket-location --bucket ${_bucket} --output text) + if [ ${_bucket_region} == "None" ]; then + _bucket_region="" + else + _bucket_region=".${_bucket_region}" + fi + + echo "" + echo "Done. Add the following variables to the pcluster config file, under the [cluster ...] 
section" + echo "template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/aws-parallelcluster.cfn.${_version}.json" + echo "custom_awsbatch_template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/batch.cfn.json" +} + +main "$@" + +# vim:syntax=sh