Skip to content

Commit

Permalink
Incorporated code review feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
ak-org committed Dec 4, 2024
1 parent 5d6add2 commit a7c38b9
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 104 deletions.
1 change: 1 addition & 0 deletions neuron-problem-detector/ecs-npd-cdk/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__/
2 changes: 1 addition & 1 deletion neuron-problem-detector/ecs-npd-cdk/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
This project contains CDK code to provision:

* An ECS Cluster and one Inf2.xlarge EC2 instance joining the cluster.
* An ECS Task Definition for Neruon Problem Detector and Recovery
* An ECS Task Definition for Neuron Problem Detector and Recovery
* An ECS Service that runs the containers as a daemon on all instances
* Related IAM roles and log groups

Expand Down
142 changes: 71 additions & 71 deletions neuron-problem-detector/ecs-npd-cdk/neuron.yaml

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"containerDefinitions": [
{
"name": "npd",
"image": "registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19",
"image": "registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.20",
"cpu": 0,
"portMappings": [
{
Expand All @@ -20,7 +20,7 @@
"-c"
],
"command": [
"echo '{\"plugin\":\"kmsg\",\"logPath\":\"/dev/kmsg\",\"lookback\":\"5m\",\"bufferSize\":10,\"source\":\"kernel-monitor\",\"conditions\":[{\"type\":\"NeuronHealth\",\"reason\":\"NeuronHasNoError\",\"message\":\"Neuronhasnoerror\"}],\"rules\":[{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_SRAM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_NC_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HBM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_DMA_ERROR\",\"pattern\":\".*NEURON_HW_ERR=DMA_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HANG_ON_COLLECTIVES\",\"pattern\":\".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*\"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json"
"echo '{\"plugin\":\"kmsg\",\"logPath\":\"/dev/kmsg\",\"lookback\":\"5m\",\"bufferSize\":10,\"source\":\"kernel-monitor\",\"conditions\":[{\"type\":\"NeuronHealth\",\"reason\":\"NeuronHasNoError\",\"message\":\"Neuronhasnoerror\"}],\"rules\":[{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_SRAM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_NC_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HBM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_DMA_ERROR\",\"pattern\":\".*NEURON_HW_ERR=DMA_ERROR.*\"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json"
],
"environment": [],
"mountPoints": [],
Expand Down Expand Up @@ -52,7 +52,7 @@
},
{
"name": "recovery",
"image": "public.ecr.aws/neuron/neuron-node-recovery:1.1.0",
"image": "public.ecr.aws/neuron/neuron-node-recovery:1.3.0",
"cpu": 0,
"portMappings": [],
"essential": true,
Expand Down Expand Up @@ -84,9 +84,6 @@
"systemControls": []
}
],
"executionRoleArn": "arn:aws:iam::367244320406:role/ecsTaskExecutionRole",
"taskRoleArn": "arn:aws:iam::367244320406:role/ecsTaskExecutionRole",
"networkMode": "awsvpc",
"requiresCompatibilities": [
"EC2"
],
Expand All @@ -96,4 +93,4 @@
"cpuArchitecture": "X86_64",
"operatingSystemFamily": "LINUX"
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,23 @@
aws_autoscaling as autoscaling,
)
from constructs import Construct
import json



class NeuronProblemDetectorStack(Stack):

def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
super().__init__(scope, construct_id, **kwargs)

with open('ecs_task_definition.json', 'r') as f:
ecs_task_definition = json.load(f)

vpc = ec2.Vpc(self, "NeuronProblemDetectorVPC", max_azs=2)

ecs_cluster = ecs.Cluster(self, "NeuronProblemDetectorCluster", vpc=vpc)

ecs_cluster.add_capacity(
id="NeruonAutoScalingGroupCapacity",
id="NeuronAutoScalingGroupCapacity",
machine_image=ecs.EcsOptimizedImage.amazon_linux2(
ecs.AmiHardwareType.NEURON
),
Expand Down Expand Up @@ -91,17 +95,17 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
"NeuronNpdAndRecoveryTaskDef",
family="neuron-npd-and-recovery",
network_mode=ecs.NetworkMode.AWS_VPC,
cpu="1024",
memory_mib="3072",
cpu=ecs_task_definition["cpu"],
memory_mib=ecs_task_definition["memory"],
compatibility=ecs.Compatibility.EC2,
execution_role=task_execution_role,
task_role=task_role
)

# Create the device mapping
device_mapping = ecs.Device(
host_path="/dev/kmsg",
container_path="/dev/kmsg",
host_path=ecs_task_definition["containerDefinitions"][0]["linuxParameters"]["devices"][0]["hostPath"],
container_path=ecs_task_definition["containerDefinitions"][0]["linuxParameters"]["devices"][0]["containerPath"],
permissions=[ecs.DevicePermission.READ, ecs.DevicePermission.WRITE],
)

Expand All @@ -113,21 +117,19 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
linux_parameters.add_devices(device_mapping)

npd_container = task_definition.add_container(
"npd",
ecs_task_definition["containerDefinitions"][0]["name"],
image=ecs.ContainerImage.from_registry(
"registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19"
ecs_task_definition["containerDefinitions"][0]["image"]
),
entry_point=["/bin/sh", "-c"],
command=[
'echo \'{"plugin":"kmsg","logPath":"/dev/kmsg","lookback":"5m","bufferSize":10,"source":"kernel-monitor","conditions":[{"type":"NeuronHealth","reason":"NeuronHasNoError","message":"Neuronhasnoerror"}],"rules":[{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_SRAM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_NC_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HBM_UNCORRECTABLE_ERROR","pattern":".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_DMA_ERROR","pattern":".*NEURON_HW_ERR=DMA_ERROR.*"},{"type":"permanent","condition":"NeuronHealth","reason":"NeuronHasError_HANG_ON_COLLECTIVES","pattern":".*NEURON_HW_ERR=HANG_ON_COLLECTIVES.*"}]}\' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json'
],
entry_point=ecs_task_definition["containerDefinitions"][0]["entrypoint"],
command=ecs_task_definition["containerDefinitions"][0]["command"],
privileged=True,
logging=ecs.AwsLogDriver(
stream_prefix="ecs",
stream_prefix=ecs_task_definition["containerDefinitions"][0]["logConfiguration"]["options"]["awslogs-stream-prefix"],
log_group=logs.LogGroup(
self,
"NpdLogGroup",
log_group_name="/ecs/npd",
log_group_name=ecs_task_definition["containerDefinitions"][0]["logConfiguration"]["options"]["awslogs-group"],
retention=logs.RetentionDays.ONE_WEEK,
),
),
Expand All @@ -136,29 +138,31 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:

npd_container.add_port_mappings(
ecs.PortMapping(
name="npd-80-tcp",
container_port=80,
host_port=80,
name=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["name"],
container_port=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["containerPort"],
host_port=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["hostPort"],
protocol=ecs.Protocol.TCP,
app_protocol=ecs.AppProtocol.http,
)
)

recovery_container = task_definition.add_container(
"recovery",
ecs_task_definition["containerDefinitions"][1]["name"],
image=ecs.ContainerImage.from_registry(
"public.ecr.aws/neuron/neuron-node-recovery:1.2.0"
ecs_task_definition["containerDefinitions"][1]["image"]
),
entry_point=["/bin/sh", "-c"],
command=["python scripts/check-health.py"],
environment={"ENABLE_RECOVERY": "true"},
readonly_root_filesystem=True,
entry_point=ecs_task_definition["containerDefinitions"][1]["entryPoint"],
command=ecs_task_definition["containerDefinitions"][1]["command"],
environment={
ecs_task_definition["containerDefinitions"][1]["environment"][0]["name"]: ecs_task_definition["containerDefinitions"][1]["environment"][0]["value"]
},
readonly_root_filesystem=ecs_task_definition["containerDefinitions"][1]["readonlyRootFilesystem"],
logging=ecs.AwsLogDriver(
stream_prefix="ecs",
stream_prefix=ecs_task_definition["containerDefinitions"][1]["logConfiguration"]["options"]["awslogs-stream-prefix"],
log_group=logs.LogGroup(
self,
"RecoveryLogGroup",
log_group_name="/ecs/recovery",
log_group_name=ecs_task_definition["containerDefinitions"][1]["logConfiguration"]["options"]["awslogs-group"],
retention=logs.RetentionDays.ONE_WEEK,
),
),
Expand Down

0 comments on commit a7c38b9

Please sign in to comment.