Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for persistent storage and retrieval of DPU reboot-cause #169

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions scripts/determine-reboot-cause
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ VERSION = "1.0"

SYSLOG_IDENTIFIER = "determine-reboot-cause"

# NOTE(review): presumably the cap on retained history files; its use is not
# visible in this hunk -- confirm.
MAX_HISTORY_FILES = 10
# Base directory for the reboot-cause files written by this script.
REBOOT_CAUSE_DIR = "/host/reboot-cause/"
# Parent directory of the per-DPU directories created by
# check_and_create_dpu_dirs() (one sub-directory per DPU name).
REBOOT_CAUSE_MODULE_DIR = "/host/reboot-cause/module"
REBOOT_CAUSE_HISTORY_DIR = "/host/reboot-cause/history/"
# File names below are resolved relative to REBOOT_CAUSE_DIR.
REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "reboot-cause.txt")
PREVIOUS_REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "previous-reboot-cause.json")
Expand Down Expand Up @@ -207,6 +209,43 @@ def determine_reboot_cause():

return previous_reboot_cause, additional_reboot_info

def check_and_create_dpu_dirs():
    """Create the per-DPU reboot-cause directories listed in platform.json.

    The platform name is taken from ``device_info.get_platform_info()`` and
    used to locate ``/usr/share/sonic/device/<platform>/platform.json``. For
    every entry under the ``DPUS`` key of that file this creates

    * ``REBOOT_CAUSE_MODULE_DIR/<dpu>/`` -- and, only when that directory is
      newly created, a ``reboot-cause.txt`` inside it containing
      ``First boot``;
    * ``REBOOT_CAUSE_MODULE_DIR/<dpu>/history/``.

    Problems locating or parsing platform.json are logged and the function
    returns without creating anything; it does not raise for them.

    Returns:
        None
    """
    platform = device_info.get_platform_info().get('platform')
    if not platform:
        sonic_logger.log_error("Platform name not found in platform info")
        return

    platform_path = os.path.join("/usr/share/sonic/device", platform, "platform.json")

    if not os.path.exists(platform_path):
        sonic_logger.log_error(f"Platform file {platform_path} not found")
        return

    # json.JSONDecodeError derives from ValueError, so malformed JSON is
    # reported here as well instead of propagating to the caller.
    try:
        with open(platform_path, 'r') as f:
            platform_data = json.load(f)
    except (OSError, ValueError) as err:
        sonic_logger.log_error(f"Unable to read platform file {platform_path}: {err}")
        return

    if not isinstance(platform_data, dict):
        sonic_logger.log_error(f"Platform file {platform_path} does not contain a JSON object")
        return

    # Iterating yields the DPU names whether 'DPUS' is a list or a mapping.
    for dpu in platform_data.get('DPUS', []):
        dpu_dir = os.path.join(REBOOT_CAUSE_MODULE_DIR, dpu)
        history_dir = os.path.join(dpu_dir, "history")

        if not os.path.exists(dpu_dir):
            os.makedirs(dpu_dir)

            # Seed the cause file only for a brand-new DPU directory so an
            # already recorded cause is never overwritten on later runs.
            reboot_file = os.path.join(dpu_dir, 'reboot-cause.txt')
            with open(reboot_file, 'w') as f:
                f.write('First boot\n')

        if not os.path.exists(history_dir):
            os.makedirs(history_dir)

def main():
# Configure logger to log all messages INFO level and higher
Expand Down Expand Up @@ -257,6 +296,10 @@ def main():
with open(REBOOT_CAUSE_FILE, "w") as cause_file:
cause_file.write(REBOOT_CAUSE_UNKNOWN)

# Create directories for DPUs on SmartSwitch platforms
if device_info.is_smartswitch():
check_and_create_dpu_dirs()


if __name__ == "__main__":
main()
84 changes: 84 additions & 0 deletions scripts/process-reboot-cause
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ try:

from swsscommon import swsscommon
from sonic_py_common import syslogger
from sonic_py_common import device_info
except ImportError as err:
raise ImportError("%s - required module not found" % str(err))

Expand All @@ -28,6 +29,7 @@ USER_ISSUED_REBOOT_CAUSE_REGEX ="User issued \'{}\' command [User: {}, Time: {}]

REBOOT_CAUSE_UNKNOWN = "Unknown"
# Prefix of the keys written by the DPU reader below: "<table>|<gen_time>".
REBOOT_CAUSE_TABLE_NAME = "REBOOT_CAUSE"
# NOTE(review): presumably the cap on retained/published history entries; its
# use is not visible in this hunk -- confirm.
MAX_HISTORY_FILES = 10

REDIS_HOSTIP = "127.0.0.1"
# Module-level DB handle used via state_db.set(...); None at import time.
# NOTE(review): presumably assigned in main() before the readers run -- confirm.
state_db = None
Expand Down Expand Up @@ -69,6 +71,84 @@ def read_reboot_cause_files_and_save_state_db():
x = TIME_SORTED_FULL_REBOOT_FILE_LIST[i]
os.remove(x)

def get_dpus():
    """Return the ``DPUS`` entry of the current platform's platform.json.

    The platform name comes from ``device_info.get_platform_info()`` and the
    file is read from ``/usr/share/sonic/device/<platform>/platform.json``.

    Returns:
        The value stored under ``DPUS`` in platform.json, or an empty list
        when the key is absent or when any error occurs (the error is logged,
        never raised).
    """
    dpus = []

    try:
        platform = device_info.get_platform_info().get('platform')
        if not platform:
            raise KeyError("Platform key missing from platform_info")

        platform_path = os.path.join("/usr/share/sonic/device", platform, "platform.json")
        sonic_logger.log_info(f"Reading platform JSON from: {platform_path}")

        with open(platform_path, 'r') as platform_file:
            platform_data = json.load(platform_file)

        dpus = platform_data.get('DPUS', [])
        if not dpus:
            sonic_logger.log_warning("No DPUS found in platform.json")

    except Exception as err:
        # Best effort: callers get whatever was collected so far.
        sonic_logger.log_error(f"Error retrieving DPUs: {err}")

    return dpus

def get_sorted_reboot_cause_files(dpu_history_path):
    """Return the ``.json`` files in *dpu_history_path*, newest first.

    Args:
        dpu_history_path: Directory holding one DPU's reboot-cause history.

    Returns:
        A list of full paths ordered by modification time, most recent first.
        An empty list is returned (and the error logged) if the directory
        cannot be listed or a file's modification time cannot be read.
    """
    try:
        json_paths = []
        for entry in os.listdir(dpu_history_path):
            if entry.endswith('.json'):
                json_paths.append(os.path.join(dpu_history_path, entry))

        # Descending modification time puts the latest reboot cause first.
        json_paths.sort(key=os.path.getmtime, reverse=True)
    except Exception as err:
        sonic_logger.log_error(f"Error retrieving reboot cause files for {dpu_history_path}: {err}")
        return []

    return json_paths


def read_dpu_reboot_cause_files_and_save_chassis_state_db():
    """Publish every DPU's saved reboot-cause history files to the DB.

    For each DPU returned by ``get_dpus()`` the JSON files under
    ``/host/reboot-cause/module/<dpu>/history`` are read newest first. Each
    file that parses to an object containing ``gen_time`` is written under
    the key ``REBOOT_CAUSE|<gen_time>`` with the fields ``cause``, ``time``,
    ``user`` and ``comment`` (missing fields are stored as empty strings).
    Files that cannot be parsed, or that lack ``gen_time``, are logged and
    skipped. Any other error is logged and ends processing; nothing is
    raised to the caller.

    Returns:
        None
    """
    # NOTE(review): the key carries no DPU identifier, so two DPUs whose files
    # share a gen_time write the same key -- confirm this is intended.
    # NOTE(review): despite the function name, writes go through
    # state_db.STATE_DB -- confirm which database is the intended target.
    try:
        history_dir = '/host/reboot-cause/module'

        for dpu in get_dpus():
            dpu_history_path = os.path.join(history_dir, dpu, "history")

            for reboot_file in get_sorted_reboot_cause_files(dpu_history_path):
                if not os.path.isfile(reboot_file):
                    continue

                try:
                    with open(reboot_file, "r") as cause_file:
                        data = json.load(cause_file)
                except json.decoder.JSONDecodeError as je:
                    sonic_logger.log_info(f"Unable to process reload cause file {reboot_file}: {je}")
                    continue

                # A non-object payload (list, string, number) has no fields
                # to publish; treat it like a file without gen_time.
                if not isinstance(data, dict) or 'gen_time' not in data:
                    sonic_logger.log_warning(f"Missing 'gen_time' in data from {reboot_file}")
                    continue

                _hash = f"{REBOOT_CAUSE_TABLE_NAME}|{data['gen_time']}"
                for field in ('cause', 'time', 'user', 'comment'):
                    state_db.set(state_db.STATE_DB, _hash, field, data.get(field, ''))
    except Exception as e:
        sonic_logger.log_error(f"Error reading DPU reboot causes: {e}")

def main():
# Configure logger to log all messages INFO level and higher
Expand Down Expand Up @@ -99,6 +179,10 @@ def main():
# Read the saved reboot-cause files and store up to 10 previous reboot-cause entries in the state DB
read_reboot_cause_files_and_save_state_db()

# For smartswitch platform store the DPU reboot-cause to CHASSIS_STATE_DB
if device_info.is_smartswitch():
read_dpu_reboot_cause_files_and_save_chassis_state_db()


if __name__ == "__main__":
main()
38 changes: 38 additions & 0 deletions tests/determine-reboot-cause_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import shutil
import pytest
import json

from swsscommon import swsscommon
from sonic_py_common.general import load_module_from_source
Expand Down Expand Up @@ -33,6 +34,8 @@
determine_reboot_cause_path = os.path.join(scripts_path, 'determine-reboot-cause')
determine_reboot_cause = load_module_from_source('determine_reboot_cause', determine_reboot_cause_path)

# Get the function that creates the DPU directories
check_and_create_dpu_dirs = determine_reboot_cause.check_and_create_dpu_dirs

PROC_CMDLINE_CONTENTS = """\
BOOT_IMAGE=/image-20191130.52/boot/vmlinuz-4.9.0-11-2-amd64 root=/dev/sda4 rw console=tty0 console=ttyS1,9600n8 quiet net.ifnames=0 biosdevname=0 loop=image-20191130.52/fs.squashfs loopfstype=squashfs apparmor=1 security=apparmor varlog_size=4096 usbcore.autosuspend=-1 module_blacklist=gpio_ich SONIC_BOOT_TYPE=warm"""
Expand Down Expand Up @@ -71,6 +74,8 @@
EXPECTED_KERNEL_PANIC_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2021_3_28_13_48_49', 'cause': 'Kernel Panic', 'user': 'N/A', 'time': 'Sun Mar 28 13:45:12 UTC 2021'}

# Relative (no leading slash) reboot-cause directory used by the tests.
REBOOT_CAUSE_DIR="host/reboot-cause/"
# Location where create_mock_platform_json() writes its mock platform.json.
PLATFORM_JSON_PATH = "/usr/share/sonic/device/test_platform/platform.json"
# NOTE(review): not referenced in the visible tests, and it differs from the
# script's "/host/reboot-cause/module" -- confirm whether it is still needed.
REBOOT_CAUSE_MODULE_DIR = "host/reboot-cause"

class TestDetermineRebootCause(object):
def test_parse_warmfast_reboot_from_proc_cmdline(self):
Expand Down Expand Up @@ -199,3 +204,36 @@ def test_determine_reboot_cause_main_with_reboot_cause_dir(self):
determine_reboot_cause.main()
assert os.path.exists("host/reboot-cause/reboot-cause.txt") == True
assert os.path.exists("host/reboot-cause/previous-reboot-cause.json") == True

def create_mock_platform_json(self, dpus):
    """Write a platform.json at PLATFORM_JSON_PATH holding only ``{"DPUS": dpus}``.

    The parent directory is created first if it does not exist yet.
    """
    platform_dir = os.path.dirname(PLATFORM_JSON_PATH)
    os.makedirs(platform_dir, exist_ok=True)

    payload = {"DPUS": dpus}
    with open(PLATFORM_JSON_PATH, "w") as platform_file:
        json.dump(payload, platform_file)

@mock.patch('sonic_py_common.device_info.is_smartswitch', return_value=True)
@mock.patch('sonic_py_common.device_info.get_platform_info', return_value={'platform': 'some_platform'})
@mock.patch('os.makedirs')
@mock.patch('os.path.exists', return_value=False)
def test_check_and_create_dpu_dirs(self, mock_exists, mock_makedirs, mock_get_platform_info, mock_is_smartswitch):
    """Without a platform.json the function returns early and creates nothing."""
    # get_platform_info is patched because that is the call the function
    # under test actually makes to learn the platform name.
    result = check_and_create_dpu_dirs()

    assert result is None
    mock_exists.assert_any_call("/usr/share/sonic/device/some_platform/platform.json")
    mock_makedirs.assert_not_called()

@mock.patch('sonic_py_common.device_info.get_platform_info', return_value={'platform': 'some_platform'})
@mock.patch('sonic_py_common.device_info.is_smartswitch', return_value=True)
@mock.patch('os.path.exists')
@mock.patch('builtins.open', new_callable=mock.mock_open, read_data='{"DPUS": ["dpu0", "dpu1"]}')
@mock.patch('os.makedirs')
def test_check_and_create_dpu_dirs_with_platform_json(self, mock_makedirs, mock_open, mock_exists, mock_is_smartswitch, mock_get_platform_info):
    """With a platform.json listing two DPUs, both directory trees are created."""
    platform_json = "/usr/share/sonic/device/some_platform/platform.json"

    # Only platform.json "exists"; every DPU directory is reported missing so
    # the creation branches are taken.
    mock_exists.side_effect = lambda path: path == platform_json

    check_and_create_dpu_dirs()

    # platform.json is read and each DPU gets a seeded reboot-cause.txt
    mock_open.assert_any_call(platform_json, 'r')
    mock_open.assert_any_call('/host/reboot-cause/module/dpu0/reboot-cause.txt', 'w')
    mock_open.assert_any_call('/host/reboot-cause/module/dpu1/reboot-cause.txt', 'w')
    mock_open.return_value.write.assert_any_call('First boot\n')

    # Both the DPU directory and its history sub-directory are created
    for dpu in ('dpu0', 'dpu1'):
        dpu_dir = os.path.join('/host/reboot-cause/module', dpu)
        mock_makedirs.assert_any_call(dpu_dir)
        mock_makedirs.assert_any_call(os.path.join(dpu_dir, 'history'))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

read_dpu_reboot_cause_files_and_save_chassis_state_db is not covered by the tests. Please add a new test for it.

Loading