Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support to pull NIM profiles from GCS cache #93

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions cloud-service-providers/google-cloud/gke/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,12 @@ cd nim-deploy/cloud-service-providers/google-cloud/gke

| Variable | Description | Default | Need update? |
|---|---|---|---|
| `registry_server` | NVIDIA Registry that hosts the images | `nvcr.io` | *No* |
| `ngc_registry_server` | NVIDIA Registry that hosts the images | `nvcr.io` | *No* |
| `ngc_api_key` | NGC API Key from NVIDIA | <> | *Yes* |
| `repository` | NIM image | `nvcr.io/nim/meta/llama3-8b-instruct` | *No* |
| `tag` | Tag of image | `1.0.0` | *No* |
| `ngc_nim_repository` | NIM image | `nvcr.io/nim/meta/llama3-8b-instruct` | *No* |
| `ngc_nim_tag` | Tag of NIM image | `1.0.0` | *No* |
| `ngc_transfer_repository` | NGC transfer image | `nvcr.io/nim/meta/llama3-8b-instruct` | *No* |
| `ngc_transfer_tag` | Tag of NGC transfer image | `1.0.0` | *No* |
| `model_name` | NIM Model name | `meta/llama3-8b-instruct` | *No* |
| `gpu_limits` | GPU Limits | `1` | *No* |

Expand All @@ -125,7 +127,7 @@ imagePullSecrets:
model:
name:
ngcAPISecret: ngc-api
nimCache: /.cache
nimCache: /opt/nim/llm/.cache
persistence:
enabled: true
existingClaim: "ngc-cache"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Small Debian image with aria2 and pigz installed; starts a POSIX shell.
FROM debian:bookworm-slim
# Install in a single layer and drop the apt package lists afterwards so they
# do not end up in the final image.
RUN apt-get update \
    && apt-get install -y aria2 pigz \
    && rm -rf /var/lib/apt/lists/*
ENTRYPOINT ["/bin/sh"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/sh
# Requests a download URL for ${GCS_FILENAME} from the service at
# ${SERVICE_FQDN} and prints it on stdout without a trailing newline.
#
# Environment read: NIM_GCS_BUCKET, NGC_EULA_TEXT, GCS_FILENAME, SERVICE_FQDN.
# Tools used: gcloud, curl, sed, base64.

# The request body embeds an identity token: create it readable by the owner
# only and remove it when the script exits, whatever the outcome.
umask 077
trap 'rm -f req.cred.json' EXIT

# use --token-format=full for print-identity-token if using GCE VM.
# NOTE(review): NGC_EULA_TEXT is inserted without JSON escaping, and the
# unquoted expansion fed to base64 collapses runs of whitespace. Both are kept
# unchanged because the payload the service expects is not visible here --
# confirm before altering either.
cat <<EOF > req.cred.json
{
"bucket": "${NIM_GCS_BUCKET}",
"text": "${NGC_EULA_TEXT}",
"textb64": "$(echo ${NGC_EULA_TEXT} | base64 -w0)",
"jwt": "$(gcloud auth print-identity-token)"
}
EOF

# POST the request and keep only the https URL found in the response.
HTTP_URL="$(curl -s -X POST -H 'accept: application/json' -H 'Content-Type: application/json' -d @req.cred.json "https://${SERVICE_FQDN}/v1/request/${GCS_FILENAME}" | sed 's/.*\(https.*\)\\\\n.*/\1/g')"
# printf rather than `echo -n`: the -n flag is not portable under /bin/sh.
printf '%s' "$HTTP_URL"
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ extraVolumes:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: "ngc-gcs-cache"
mountOptions: "max-conns-per-host=0"
mountOptions: "implicit-dirs,max-conns-per-host=0"
extraVolumeMounts:
cache-volume:
mountPath: /upload-dir
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

set -euo pipefail

export CACHE_PATH="$NIM_CACHE_PATH"

if [ -n "${NGC_BUNDLE_URL:-}" ]; then
# Create a sub-directory, as tar tries to modify the parent folder permissions
export CACHE_PATH="$NIM_CACHE_PATH/cache"
# -p: under `set -e` a plain mkdir would abort the script if the directory
# already exists (for example when the job is retried on the same volume).
mkdir -p "$CACHE_PATH"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be mkdir -p?

MODEL_BUNDLE_FILENAME="model.tar"
# Fetch and extract from the provided URL, with max concurrency
aria2c -x 16 -s 16 -j 10 --dir "$CACHE_PATH" --out="$MODEL_BUNDLE_FILENAME" "$NGC_BUNDLE_URL"
tar xf "$CACHE_PATH/$MODEL_BUNDLE_FILENAME" -C "$CACHE_PATH"
rm "$CACHE_PATH/$MODEL_BUNDLE_FILENAME"
else
# Fetch directly from NGC to $NIM_CACHE_PATH
download-to-cache
fi

# Mirror the cache into /upload-dir: recreate the directory tree first, then
# copy regular files and symlinks (symlinks are copied as links, not followed).
# "$CACHE_PATH" is quoted so a path containing spaces or glob characters is
# passed to find/cp as a single word.
find "$CACHE_PATH" -type d -printf '%P\n' | xargs -P 100 -I {} mkdir -p /upload-dir/{}
find "$CACHE_PATH" -type f,l -printf '%P\n' | xargs -P 100 -I {} cp --no-dereference "$CACHE_PATH"/{} /upload-dir/{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# ConfigMap carrying files/ngc_pull.sh from the chart under the key
# ngc_pull.sh.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .Release.Name }}-scripts-configmap
  labels:
    {{- include "nim-llm.labels" . | nindent 4 }}
data:
  ngc_pull.sh: |-
# The next line must start at column 0: `indent 4` already prefixes every line
# of the script, including the first, with four spaces.
{{ .Files.Get "files/ngc_pull.sh" | indent 4 }}

Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,25 @@ spec:
{{- toYaml .Values.containerSecurityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command:
- /bin/bash
- -c
- "download-to-cache && find $NIM_CACHE_PATH -type d -printf '%P\\n' | xargs -P 100 -I {} mkdir -p /upload-dir/{} && find $NIM_CACHE_PATH -type f,l -printf '%P\\n' | xargs -P 100 -I {} cp --no-dereference $NIM_CACHE_PATH/{} /upload-dir/{}"
command: ["/bin/sh", "-c"]
args: ["/scripts/ngc_pull.sh"]
env:
- name: NIM_CACHE_PATH
value: {{ .Values.model.nimCache | quote }}
{{- if .Values.model.ngcAPISecret }}
- name: NGC_API_KEY
valueFrom:
secretKeyRef:
name: {{ .Values.model.ngcAPISecret }}
key: NGC_API_KEY
{{- end }}
{{- if .Values.model.ngcBundleURLSecret }}
- name: NGC_BUNDLE_URL
valueFrom:
secretKeyRef:
name: {{ .Values.model.ngcBundleURLSecret }}
key: NGC_BUNDLE_URL
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
Expand All @@ -56,6 +63,8 @@ spec:
{{- else }}
mountPath: {{ .Values.model.nimCache }}
{{- end }}
- mountPath: /scripts
name: scripts-volume
{{- if .Values.extraVolumeMounts }}
{{- range $k, $v := .Values.extraVolumeMounts }}
- name: {{ $k }}
Expand Down Expand Up @@ -92,6 +101,10 @@ spec:
{{- else }}
emptyDir: {}
{{- end }}
- name: scripts-volume
configMap:
name: {{ .Release.Name }}-scripts-configmap
defaultMode: 0555
{{- if .Values.extraVolumes }}
{{- range $k, $v := .Values.extraVolumes }}
- name: {{ $k }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ spec:
{{- end }}
{{- if .Values.persistence.mountOptions }}
mountOptions:
- {{ .Values.persistence.mountOptions | quote }}
{{- range .Values.persistence.mountOptions }}
- {{ . | quote }}
{{- end }}
{{- end }}
{{- if .Values.persistence.csi }}
csi:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ model: # most of these values only matter if not using customCommand
nimCache: /model-store
name: my-model # optional name of the model in the OpenAI API -- used in `helm test`
ngcAPISecret: ngc-api
# ngcBundleURLSecret: ngc-bundle-url
openaiPort: 8000
labels: {} # any extra labels desired on deployed pods
jsonLogging: true
Expand Down
65 changes: 55 additions & 10 deletions cloud-service-providers/google-cloud/gke/infra/3-config/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data "google_project" "current" {
locals {
cluster_name = data.terraform_remote_state.gke-cluster.outputs.cluster_name
cluster_location = data.terraform_remote_state.gke-cluster.outputs.cluster_location
use_bundle_url = var.ngc_bundle_gcs_bucket != "" && var.ngc_bundle_filename != ""
}

provider "kubernetes" {
Expand All @@ -48,13 +49,39 @@ resource "null_resource" "get-credentials" {

}

# Reads the NIM_GKE_GCS_SIGNED_URL_EULA file that sits next to this module.
# Its content is handed to fetch-ngc-url.sh as NGC_EULA_TEXT.
data "local_file" "ngc-eula" {
filename = "${path.module}/NIM_GKE_GCS_SIGNED_URL_EULA"
}

# Runs fetch-ngc-url.sh locally and stores its stdout in ngc_signed_url.txt
# inside the module directory. Only created when both bundle variables are
# set (see local.use_bundle_url).
resource "null_resource" "get-signed-ngc-bundle-url" {
  count = local.use_bundle_url ? 1 : 0

  # Re-run the fetch when the script changes or when a different bundle is
  # requested; otherwise a URL for the previous bucket/file would be reused.
  triggers = {
    shell_hash = filesha256("${path.module}/fetch-ngc-url.sh")
    bucket     = var.ngc_bundle_gcs_bucket
    filename   = var.ngc_bundle_filename
  }

  provisioner "local-exec" {
    # Address the script through path.module, like the trigger and the output
    # file, so the command does not depend on the current working directory.
    command = "${path.module}/fetch-ngc-url.sh > ${path.module}/ngc_signed_url.txt"
    environment = {
      NGC_EULA_TEXT  = data.local_file.ngc-eula.content
      NIM_GCS_BUCKET = var.ngc_bundle_gcs_bucket
      GCS_FILENAME   = var.ngc_bundle_filename
      SERVICE_FQDN   = "nim-gke-gcs-signed-url-722708171432.us-central1.run.app"
    }
  }
}

# Reads back ngc_signed_url.txt, the file written by the local-exec
# provisioner of null_resource.get-signed-ngc-bundle-url. The explicit
# depends_on makes the read wait for that resource.
data "local_file" "ngc-bundle-url" {
count = local.use_bundle_url ? 1 : 0
filename = "${path.module}/ngc_signed_url.txt"
depends_on = [null_resource.get-signed-ngc-bundle-url]
}

resource "kubernetes_namespace" "nim" {
metadata {
name = "nim"
}
}

resource "kubernetes_secret" "registry_secret" {
resource "kubernetes_secret" "ngc_registry_secret" {
metadata {
name = "registry-secret"
namespace = "nim"
Expand All @@ -65,7 +92,7 @@ resource "kubernetes_secret" "registry_secret" {
data = {
".dockerconfigjson" = jsonencode({
"auths" = {
"${var.registry_server}" = {
"${var.ngc_registry_server}" = {
"username" = var.ngc_username
"password" = var.ngc_api_key
"auth" = base64encode("${var.ngc_username}:${var.ngc_api_key}")
Expand All @@ -90,7 +117,22 @@ resource "kubernetes_secret" "ngc_api" {
}

depends_on = [kubernetes_namespace.nim]
}

# Secret "ngc-bundle-url" in the "nim" namespace, holding the fetched URL
# under the key NGC_BUNDLE_URL. Only created when local.use_bundle_url is
# true.
resource "kubernetes_secret" "ngc_bundle_url" {
count = local.use_bundle_url ? 1 : 0
metadata {
name = "ngc-bundle-url"
namespace = "nim"
}

type = "Opaque" # Generic secret type

data = {
# Content of ngc_signed_url.txt as read by data.local_file.ngc-bundle-url.
"NGC_BUNDLE_URL" = "${data.local_file.ngc-bundle-url[0].content}"
}

depends_on = [kubernetes_namespace.nim]
}

resource "kubernetes_service_account" "ngc_gcs_ksa" {
Expand All @@ -101,9 +143,12 @@ resource "kubernetes_service_account" "ngc_gcs_ksa" {
depends_on = [kubernetes_namespace.nim]
}

resource "random_uuid" "gcs_cache_uuid" {
}

resource "google_storage_bucket" "ngc_gcs_cache" {
project = data.google_project.current.name
name = "${data.google_project.current.name}-ngc-gcs-cache"
name = "ngc-gcs-cache-${random_uuid.gcs_cache_uuid.result}"
location = "US"
force_destroy = true

Expand Down Expand Up @@ -143,12 +188,12 @@ resource "helm_release" "ngc_to_gcs_transfer" {

set {
name = "image.repository"
value = var.repository
value = var.ngc_transfer_repository
}

set {
name = "image.tag"
value = var.tag
value = var.ngc_transfer_tag
}

set {
Expand All @@ -166,9 +211,9 @@ resource "helm_release" "ngc_to_gcs_transfer" {
value = var.gpu_limits
}

depends_on = [kubernetes_secret.ngc_api, google_storage_bucket_iam_binding.ngc_gcs_ksa_binding]
depends_on = [kubernetes_secret.ngc_api, kubernetes_secret.ngc_bundle_url, google_storage_bucket_iam_binding.ngc_gcs_ksa_binding]

timeout = 900
timeout = 3600
wait = true
}

Expand All @@ -184,17 +229,17 @@ resource "helm_release" "my_nim" {

set {
name = "csi.volumeAttributes.bucketName"
value = google_storage_bucket.ngc_gcs_cache.name
# Use the name of google_storage_bucket.ngc_gcs_cache: it now ends in a random
# UUID, so a hard-coded bucket name cannot match the bucket this module creates.
value = google_storage_bucket.ngc_gcs_cache.name
Copy link
Collaborator

@supertetelman supertetelman Oct 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we mean for this to be a static value? Is there anything significant about this specific bucket?

}

set {
name = "image.repository"
value = var.repository
value = var.ngc_nim_repository
}

set {
name = "image.tag"
value = var.tag
value = var.ngc_nim_tag
}

set {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.


variable "registry_server" {
variable "ngc_registry_server" {
type = string
default = "nvcr.io"
description = "Registry that hosts the NIM images"
Expand All @@ -33,16 +33,36 @@ variable "ngc_api_key" {
sensitive = true
}

variable "repository" {
variable "ngc_transfer_repository" {
type = string
description = "Docker image of NGC transfer container"
}

variable "ngc_transfer_tag" {
type = string
description = "Docker repository tag of the NGC transfer container"
}

variable "ngc_nim_repository" {
type = string
description = "Docker image of NIM container"
}

variable "tag" {
variable "ngc_nim_tag" {
type = string
description = "Docker repository tag of NIM container"
}

# Optional. main.tf's local.use_bundle_url is true only when both this and
# ngc_bundle_filename are non-empty; the empty default disables the bundle
# download instead of making the variable mandatory.
variable "ngc_bundle_gcs_bucket" {
  type        = string
  default     = ""
  description = "GCS bucket containing NGC bucket with NIM profiles"
}

# Optional. main.tf's local.use_bundle_url is true only when both this and
# ngc_bundle_gcs_bucket are non-empty; the empty default disables the bundle
# download instead of making the variable mandatory.
variable "ngc_bundle_filename" {
  type        = string
  default     = ""
  description = "Filename containing NIM profiles from NGC"
}

variable "model_name" {
type = string
description = "Name of the NIM model"
Expand Down
2 changes: 2 additions & 0 deletions helm/nim-llm/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,13 @@ spec:
env:
- name: NIM_CACHE_PATH
value: {{ .Values.model.nimCache | quote }}
{{- if .Values.model.ngcAPISecret }}
- name: NGC_API_KEY
valueFrom:
secretKeyRef:
name: {{ .Values.model.ngcAPISecret }}
key: NGC_API_KEY
{{- end }}
- name: OUTLINES_CACHE_DIR
value: /tmp/outlines
- name: NIM_SERVER_PORT
Expand Down