Feat/use jumpstart streamlined for new llm options (#192)
* chore: run terraform fmt with -recursive option

* check on each PR

* wip

* local

* latest wip

* all working

* formatting

* modify

* restore status after problematic merge

* latest

* formatting
aidanrussell authored Dec 19, 2024
1 parent b611555 commit e8aa255
Showing 6 changed files with 111 additions and 56 deletions.
8 changes: 4 additions & 4 deletions infra/ecs_notebooks_notebook.tf
@@ -354,9 +354,9 @@ data "aws_iam_policy_document" "aws_vpc_endpoint_s3_notebooks" {
       "s3:ListBucket",
     ]

-    resources = [
+    resources = concat([
       for bucket in aws_s3_bucket.mlflow : bucket.arn
-    ]
+    ], ["arn:aws:s3:::jumpstart-cache-prod-eu-west-2"])
   }

   statement {
@@ -369,9 +369,9 @@ data "aws_iam_policy_document" "aws_vpc_endpoint_s3_notebooks" {
       "s3:GetObject",
     ]

-    resources = [
+    resources = concat([
       "arn:aws:s3:::${var.mirrors_data_bucket_name != "" ? var.mirrors_data_bucket_name : var.mirrors_bucket_name}/*",
-    ]
+    ], ["arn:aws:s3:::jumpstart-cache-prod-eu-west-2/*"])
   }

   statement {
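The change above switches the resource list to Terraform's concat() so the statically named JumpStart cache bucket can be appended to the dynamically generated MLflow bucket ARNs. A minimal sketch of the same pattern in isolation (the locals block and its names are illustrative, not part of this commit):

locals {
  # ARNs derived from the mlflow buckets managed elsewhere in this configuration
  mlflow_bucket_arns = [for bucket in aws_s3_bucket.mlflow : bucket.arn]

  # concat() merges the generated list with the static JumpStart cache bucket ARN
  notebook_policy_bucket_arns = concat(
    local.mlflow_bucket_arns,
    ["arn:aws:s3:::jumpstart-cache-prod-eu-west-2"],
  )
}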
20 changes: 16 additions & 4 deletions infra/modules/sagemaker_deployment/main.tf
@@ -4,15 +4,26 @@ resource "aws_sagemaker_model" "sagemaker_model" {
   execution_role_arn = var.execution_role_arn

   primary_container {
-    image          = var.container_image
-    model_data_url = var.model_data_url
-    environment    = var.environment
+    image       = var.container_image
+    environment = var.environment_variables
+
+    model_data_source {
+      s3_data_source {
+        s3_uri           = var.uncompressed_model_uri
+        s3_data_type     = "S3Prefix"
+        compression_type = "None"
+        model_access_config {
+          accept_eula = true
+        }
+      }
+    }
   }

   vpc_config {
     security_group_ids = var.security_group_ids
     subnets            = var.subnets
   }

 }

 # Endpoint Configuration
@@ -51,6 +62,7 @@ resource "aws_appautoscaling_target" "autoscaling_target" {
   resource_id        = "endpoint/${aws_sagemaker_endpoint.sagemaker_endpoint.name}/variant/${var.variant_name}"
   scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
   service_namespace  = "sagemaker"
+  depends_on         = [aws_sagemaker_endpoint.sagemaker_endpoint, aws_sagemaker_endpoint_configuration.endpoint_config]
 }

 # Autoscaling Policy for Scaling Up
@@ -135,7 +147,7 @@ resource "aws_appautoscaling_policy" "scale_in_to_zero_based_on_backlog" {
   depends_on = [aws_appautoscaling_target.autoscaling_target]
 }

-resource "aws_cloudwatch_log_metric_filter" "unatuhorized_operations" {
+resource "aws_cloudwatch_log_metric_filter" "unauthorized_operations" {
   name           = "unauthorized-operations-filter"
   log_group_name = var.log_group_name
   pattern        = "{ $.errorCode = \"UnauthorizedOperation\" || $.errorCode = \"AccessDenied\" }"
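The key change in this file replaces the compressed model_data_url with a model_data_source block, which lets SageMaker load uncompressed artifacts straight from an S3 prefix (the form in which JumpStart publishes its prepacked models) instead of unpacking a model.tar.gz; accept_eula = true is required for gated models. A standalone sketch of the resulting resource shape, with placeholder name, role ARN, and prefix where this module takes variables:

resource "aws_sagemaker_model" "example" {
  name               = "example-llm"                                           # placeholder
  execution_role_arn = "arn:aws:iam::123456789012:role/example-inference-role" # placeholder

  primary_container {
    image = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.0-gpu-py310-cu121-ubuntu20.04"

    model_data_source {
      s3_data_source {
        # Trailing slash matters: with S3Prefix the URI names a folder of artifacts
        s3_uri           = "s3://jumpstart-cache-prod-eu-west-2/example-model-prefix/" # placeholder
        s3_data_type     = "S3Prefix"
        compression_type = "None"
        model_access_config {
          accept_eula = true
        }
      }
    }
  }
}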
6 changes: 3 additions & 3 deletions infra/modules/sagemaker_deployment/variables.tf
@@ -18,12 +18,12 @@ variable "container_image" {
   description = "Container image for the model"
 }

-variable "model_data_url" {
+variable "uncompressed_model_uri" {
   type        = string
   description = "S3 URL where the model data is located"
 }

-variable "environment" {
+variable "environment_variables" {
   type        = map(string)
   description = "Environment variables for the container"
 }
@@ -116,4 +116,4 @@ variable "log_group_name" {
   type        = string
   description = "log group name, i.e. gpt-neo-125m..."
   default     = ""
-}
+}
26 changes: 22 additions & 4 deletions infra/modules/sagemaker_init/iam/main.tf
@@ -51,17 +51,36 @@ resource "aws_iam_role" "inference_role" {
 data "aws_iam_policy_document" "sagemaker_inference_policy_document" {
   statement {
     actions = [
-      "s3:ListBucket",
-      "s3:GetObject",
       "s3:PutObject",
       "s3:DeleteObject"
     ]
     resources = [
       "arn:aws:s3:::*sagemaker*",
-      "${var.aws_s3_bucket_notebook.arn}/*"
+      "${var.aws_s3_bucket_notebook.arn}/*",
     ]
   }
+  statement {
+    actions = [
+      "s3:GetObject",
+    ]
+    resources = [
+      "arn:aws:s3:::*sagemaker*",
+      "${var.aws_s3_bucket_notebook.arn}/*",
+      "arn:aws:s3:::jumpstart-cache-prod-eu-west-2/*"
+    ]
+  }
+  statement {
+    actions = [
+      "s3:ListBucket",
+      "s3:GetBucketLocation",
+    ]
+
+    resources = [
+      "arn:aws:s3:::jumpstart-cache-prod-eu-west-2",
+      "${var.aws_s3_bucket_notebook.arn}",
+      "arn:aws:s3:::*sagemaker*"
+    ]
+  }
   statement {
     actions = [
       "SNS:Publish",
@@ -239,4 +258,3 @@ resource "aws_iam_role_policy_attachment" "attach_cloudwatch_log_invoke_policy"
   role       = aws_iam_role.lambda_execution_role.name
   policy_arn = aws_iam_policy.cloudwatch_log_invoke_policy.arn
 }
-
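The reworked policy document follows the standard S3 IAM split: object-level actions (s3:GetObject, s3:PutObject, s3:DeleteObject) target object ARNs ending in /*, while bucket-level actions (s3:ListBucket, s3:GetBucketLocation) target the bare bucket ARN. The distinction in isolation, as a minimal sketch (the bucket name is illustrative):

data "aws_iam_policy_document" "example_split" {
  # Object-level actions: resources carry the /* object suffix
  statement {
    actions   = ["s3:GetObject"]
    resources = ["arn:aws:s3:::example-bucket/*"]
  }

  # Bucket-level actions: resources are the bucket ARN itself, no /*
  statement {
    actions   = ["s3:ListBucket", "s3:GetBucketLocation"]
    resources = ["arn:aws:s3:::example-bucket"]
  }
}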

88 changes: 48 additions & 40 deletions infra/sagemaker_llm_resources.tf
@@ -1,25 +1,37 @@
-##################################################################################################################
-# GPT Neo 125M parameter endpoint and associated alarms and policies
-#################################################################################################################
+locals {
+  all_endpoint_names = [
+    module.gpt_neo_125_deployment.endpoint_name,
+    module.llama_3_2_1b_deployment.endpoint_name
+  ]
+}
+
+################
+# GPT Neo 125m
+###############

 module "gpt_neo_125_deployment" {
-  source                = "./modules/sagemaker_deployment"
-  model_name            = "gpt-neo-125m"
-  sns_success_topic_arn = module.sagemaker_output_mover.sns_success_topic_arn
-  execution_role_arn    = module.iam.inference_role
-  container_image       = var.hugging_face_model_image
-  model_data_url        = "${var.sagemaker_models_folder}/gpt-neo-125m.tar.gz"
-  environment = {
-    "HF_MODEL_ID"      = "/opt/ml/model/"
-    "SM_NUM_GPUS"      = 1
-    "MAX_INPUT_LENGTH" = 1024
-    "MAX_TOTAL_TOKENS" = 2048
+  source                 = "./modules/sagemaker_deployment"
+  model_name             = "gpt-neo-125m"
+  sns_success_topic_arn  = module.sagemaker_output_mover.sns_success_topic_arn
+  execution_role_arn     = module.iam.inference_role
+  container_image        = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.0-gpu-py310-cu121-ubuntu20.04"
+  uncompressed_model_uri = "s3://jumpstart-cache-prod-eu-west-2/huggingface-textgeneration1/huggingface-textgeneration1-gpt-neo-125m/artifacts/inference-prepack/v2.0.0/"
+  environment_variables = {
+    "ENDPOINT_SERVER_TIMEOUT" : "3600",
+    "HF_MODEL_ID" : "/opt/ml/model",
+    "MAX_INPUT_LENGTH" : "1024",
+    "MAX_TOTAL_TOKENS" : "2048",
+    "MODEL_CACHE_ROOT" : "/opt/ml/model",
+    "SAGEMAKER_ENV" : "1",
+    "SAGEMAKER_MODEL_SERVER_WORKERS" : "1",
+    "SAGEMAKER_PROGRAM" : "inference.py",
+    "SM_NUM_GPUS" : "1"
   }
   security_group_ids   = [aws_security_group.notebooks.id]
   subnets              = aws_subnet.private_without_egress.*.id
   endpoint_config_name = "sagemaker-endpoint-config-gpt-neo-125m"
   endpoint_name        = "gpt-neo-125-endpoint"
-  variant_name         = "gpt-neo-125m-endpoint-example"
+  variant_name         = "gpt-neo-125m-endpoint-dev"
   instance_type        = "ml.g5.2xlarge"
   s3_output_path       = "https://${module.iam.default_sagemaker_bucket.bucket_regional_domain_name}"
   initial_instance_count = 1
@@ -43,6 +55,7 @@ module "gpt_neo_125_deployment" {
       period         = 30
       statistic      = "Average"
       alarm_actions  = [module.gpt_neo_125_deployment.scale_up_policy_arn]
+      sns_topic_name = "backlog-alarm-${module.gpt_neo_125_deployment.endpoint_name}"
     },
     {
       alarm_name = "low-cpu-alarm-${module.gpt_neo_125_deployment.endpoint_name}"
@@ -56,6 +69,7 @@ module "gpt_neo_125_deployment" {
       period         = 60
       statistic      = "Average"
      alarm_actions  = [module.gpt_neo_125_deployment.scale_in_to_zero_policy_arn]
+      sns_topic_name = "low-cpu-alert-${module.gpt_neo_125_deployment.endpoint_name}"
     },
     {
       alarm_name = "no-query-in-backlog-alarm-${module.gpt_neo_125_deployment.endpoint_name}"
@@ -183,33 +197,34 @@ module "gpt_neo_125_deployment" {
       alarm_actions = [module.sns.unauthorised_access_sns_topic_arn]
     }
   ]

 }

-##################################################################################################################
-# Llama 3.2 1B parameter endpoint and associated alarms and policies
-#################################################################################################################
+###############
+# Llama 3.2 1B
+###############
 module "llama_3_2_1b_deployment" {
-  source                = "./modules/sagemaker_deployment"
-  model_name            = "Llama-3-2-1B"
-  sns_success_topic_arn = module.sagemaker_output_mover.sns_success_topic_arn
-  execution_role_arn    = module.iam.inference_role
-  container_image       = var.hugging_face_model_image
-  model_data_url        = "${var.sagemaker_models_folder}/Llama-3.2-1B.tar.gz"
-  environment = {
-    "HF_MODEL_ID"      = "/opt/ml/model/"
-    "SM_NUM_GPUS"      = 1
-    "MAX_INPUT_LENGTH" = 1024
-    "MAX_TOTAL_TOKENS" = 2048
+  source                 = "./modules/sagemaker_deployment"
+  model_name             = "llama-3-2-1b"
+  sns_success_topic_arn  = module.sagemaker_output_mover.sns_success_topic_arn
+  execution_role_arn     = module.iam.inference_role
+  container_image        = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/djl-inference:0.29.0-lmi11.0.0-cu124"
+  uncompressed_model_uri = "s3://jumpstart-private-cache-prod-eu-west-2/meta-textgeneration/meta-textgeneration-llama-3-2-1b/artifacts/inference-prepack/v1.0.0/"
+  environment_variables = {
+    "ENDPOINT_SERVER_TIMEOUT" : "3600",
+    "HF_MODEL_ID" : "/opt/ml/model",
+    "MODEL_CACHE_ROOT" : "/opt/ml/model",
+    "OPTION_ENABLE_CHUNKED_PREFILL" : "true",
+    "SAGEMAKER_ENV" : "1",
+    "SAGEMAKER_MODEL_SERVER_WORKERS" : "1",
+    "SAGEMAKER_PROGRAM" : "inference.py"
   }
   security_group_ids   = [aws_security_group.notebooks.id]
   subnets              = aws_subnet.private_without_egress.*.id
   endpoint_config_name = "sagemaker-endpoint-config-llama-3-2-1B"
   endpoint_name        = "llama-3-2-1b-endpoint"
-  variant_name         = "llama-3-2-1B-endpoint-example"
-  instance_type        = "ml.g5.2xlarge"
+  variant_name         = "llama-3-2-1b-endpoint-dev"
+  instance_type        = "ml.g6.xlarge"
   initial_instance_count = 1
   s3_output_path       = "https://${module.iam.default_sagemaker_bucket.bucket_regional_domain_name}"
   max_capacity         = 2
@@ -373,10 +388,3 @@ module "llama_3_2_1b_deployment" {
     }
   ]
 }
-
-locals {
-  all_endpoint_names = [
-    module.gpt_neo_125_deployment.endpoint_name,
-    module.llama_3_2_1b_deployment.endpoint_name
-  ]
-}
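The locals block collecting the endpoint names now sits at the top of the file rather than the bottom; the move is organisational only, since Terraform resolves references regardless of declaration order. A hypothetical consumer of the list, purely for illustration:

output "llm_endpoint_names" {
  description = "Names of every SageMaker LLM endpoint managed here"
  value       = local.all_endpoint_names
}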
19 changes: 18 additions & 1 deletion infra/vpc.tf
@@ -537,14 +537,31 @@ data "aws_iam_policy_document" "datasets_s3_endpoint" {
       }

       actions = [
-        "s3:GetObject",
+        "s3:GetObject"
       ]

       resources = [
         "arn:aws:s3:::prod-${data.aws_region.aws_region.name}-starport-layer-bucket/*",
       ]
     }
   }
+  dynamic "statement" {
+    for_each = var.arango_on ? [0] : []
+    content {
+      principals {
+        type        = "AWS"
+        identifiers = ["*"]
+      }
+
+      actions = [
+        "s3:ListBucket",
+      ]
+
+      resources = [
+        "arn:aws:s3:::prod-${data.aws_region.aws_region.name}-starport-layer-bucket",
+      ]
+    }
+  }
 }

 resource "aws_vpc_endpoint" "datasets_ec2_endpoint" {
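The added dynamic "statement" block uses the usual Terraform idiom for conditionally emitting a nested block: for_each over a one-element list when the flag is on, over an empty list (emitting nothing) when it is off. The idiom in isolation, as a sketch (variable and bucket names assumed for illustration):

variable "feature_on" {
  type    = bool
  default = false
}

data "aws_iam_policy_document" "conditional_statement_example" {
  dynamic "statement" {
    # [0] yields one iteration (statement present); [] yields none (statement omitted)
    for_each = var.feature_on ? [0] : []
    content {
      actions   = ["s3:ListBucket"]
      resources = ["arn:aws:s3:::example-bucket"]
    }
  }
}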
