Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Focused efforts on addressing bugs identified in the obdiag 3.0.0 iteration #653

Merged
merged 15 commits into from
Dec 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions plugins/check/tasks/observer/system/instruction_set_avx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Check task: verifies that the CPU flags reported by lscpu include AVX.
info: "Check the flags of cpu"
task:
- steps:
# Single ssh step: run lscpu and keep only the line(s) containing "Flags".
- type: ssh
ssh: "lscpu |grep Flags"
result:
# The command output is captured into the variable cpu_flags.
set_value: cpu_flags
# Shell-style glob test: true when cpu_flags contains the substring "avx"
# anywhere (so avx2 / avx512* flags satisfy it as well).
verify: " [[ $cpu_flags == *avx* ]] "
# NOTE(review): presumably reported when the verify expression is false - confirm
# against the check framework.
err_msg: "observer need cpu support avx. If the cpu is not support avx, observer will be crash."
15 changes: 14 additions & 1 deletion rpm/init.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
#!/usr/bin/env bash

# Work out which home directory obdiag should be installed under.
#   CURRENT_USER_ID   - numeric uid of the process running this script.
#   CURRENT_USER_NAME - login name from logname; if logname fails, the leading
#                       [a-zA-Z0-9_] run of $SUDO_USER is used instead.
#   USER_HOME         - home directory used later as the default OBDIAG_HOME base.
CURRENT_USER_ID=$(id -u)
CURRENT_USER_NAME=$(logname 2>/dev/null || echo "$SUDO_USER" | awk -F'[^a-zA-Z0-9_]' '{print $1}')

if [ "$CURRENT_USER_ID" -eq 0 ]; then
    # Running as root: when invoked through sudo, prefer the invoking user's
    # home directory as recorded in the passwd database.
    USER_HOME=""
    if [ -n "$SUDO_USER" ]; then
        USER_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6)
    fi
    # getent prints nothing when the account cannot be resolved; without this
    # fallback USER_HOME would be empty and OBDIAG_HOME would become "/.obdiag".
    if [ -z "$USER_HOME" ]; then
        USER_HOME=/root
    fi
else
    USER_HOME="$HOME"
fi

if [[ $# == 1 && $1 == "-f" ]]; then
FORCE_DEPLOY="1"
else
Expand All @@ -11,7 +24,7 @@ WORK_DIR=$(readlink -f "$(dirname ${BASH_SOURCE[0]})")
if [ ${OBDIAG_HOME} ]; then
OBDIAG_HOME=${OBDIAG_HOME}
else
OBDIAG_HOME="${HOME}/.obdiag"
OBDIAG_HOME="${USER_HOME}/.obdiag"
fi

mkdir -p ${OBDIAG_HOME} && cd ${OBDIAG_HOME}
Expand Down
87 changes: 61 additions & 26 deletions rpm/obdiag_backup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,30 +48,31 @@ TIMESTAMP=$(date +"%Y%m%d%H%M%S")
BASE_NAME="obdiag_backup${VERSION:+_v$VERSION}"
TARFILE="$BACKUP_DIR/${BASE_NAME}_$TIMESTAMP.tar.gz"

# Check if a file with the same name already exists in the BACKUP_DIR
# Check if a file with the same base name already exists in the BACKUP_DIR
if find "$BACKUP_DIR" -maxdepth 1 -name "${BASE_NAME}_*.tar.gz" -print -quit | grep -q .; then
echo "A backup file with the same name already exists. Skipping backup creation."
exit 0
echo "A backup file with the same base name already exists. Skipping backup creation."
exit 0
fi

# Temporary directory for staging backup files
# Temporary directory for staging backup files, including top-level directory
TEMP_BACKUP_DIR="$BACKUP_DIR/tmp_obdiag_backup_$TIMESTAMP"
mkdir -p "$TEMP_BACKUP_DIR"
TOP_LEVEL_DIR="$TEMP_BACKUP_DIR/obdiag_backup${VERSION:+_v$VERSION}_$TIMESTAMP" # Top-level directory inside the tarball
mkdir -p "$TOP_LEVEL_DIR"

# Iterate over each directory to be backed up
for dir in "${DIRS[@]}"; do
# Check if the source directory exists
if [ -d "$SOURCE_DIR$dir" ]; then
# Copy the directory into the temporary backup directory
cp -rp "$SOURCE_DIR$dir" "$TEMP_BACKUP_DIR/"
echo "Copied $dir to temporary backup directory."
# Copy the directory into the top-level directory within the temporary backup directory
cp -rp "$SOURCE_DIR$dir" "$TOP_LEVEL_DIR/"
echo "Copied $dir to temporary backup directory under ${BASE_NAME}_$TIMESTAMP."
else
echo "Source directory $SOURCE_DIR$dir does not exist. Skipping."
fi
done

# Create a tar.gz archive
if tar -czf "$TARFILE" -C "$TEMP_BACKUP_DIR" .; then
# Create a tar.gz archive with the top-level directory included
if tar -czf "$TARFILE" -C "$TEMP_BACKUP_DIR" "obdiag_backup${VERSION:+_v$VERSION}_$TIMESTAMP"; then
echo "Backup archive created successfully at $TARFILE"
else
echo "Failed to create backup archive."
Expand All @@ -85,19 +86,53 @@ echo "Temporary files removed."
# Cleanup phase: Remove backups older than one year or delete the oldest backups if more than 12 exist
ONE_YEAR_AGO="+365" # find command uses days, so +365 means older than one year

# Remove backups older than one year
find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -mtime $ONE_YEAR_AGO -exec rm -f {} \;
echo "Removed old backup files older than one year."

# If there are more than 12 backups, remove the excess oldest ones
BACKUP_FILES=($(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -printf '%T@ %p\n' | sort -n))
NUM_BACKUPS=${#BACKUP_FILES[@]}

if [ $NUM_BACKUPS -gt 12 ]; then
COUNT_TO_DELETE=$((NUM_BACKUPS - 12))
for ((i = 0; i < COUNT_TO_DELETE; i++)); do
FILE_PATH=${BACKUP_FILES[i]#* }
rm -f "$FILE_PATH"
echo "Removed excess backup file: $FILE_PATH"
done
fi
# Delete exactly one backup archive: the one with the oldest modification time
# among "$BACKUP_DIR"/obdiag_backup_*.tar.gz, reporting each step on stdout.
# Side effect: the global BACKUP_FILE holds the selected path (empty if none).
# Returns 0 when the file was removed, 1 when nothing matched or rm failed.
remove_oldest_backup() {
    # find emits "<mtime> <path>" per archive; after sorting, the first line is
    # the oldest one, and cut drops the timestamp while keeping spaces in paths.
    BACKUP_FILE=$(
        find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -printf '%T+ %p\n' \
            | sort \
            | head -n 1 \
            | cut -d ' ' -f2-
    )

    if [ -z "$BACKUP_FILE" ]; then
        echo "No backup files found."
        return 1
    fi

    echo "Attempting to remove oldest backup file: $BACKUP_FILE"
    if ! rm -f "$BACKUP_FILE"; then
        echo "Failed to remove oldest backup file: $BACKUP_FILE"
        return 1
    fi

    echo "Successfully removed oldest backup file: $BACKUP_FILE"
    return 0
}

# Report whether any "$BACKUP_DIR"/obdiag_backup_*.tar.gz archive matches the
# age filter held in ONE_YEAR_AGO (passed to find's -mtime).
# Prints the outcome on stdout; returns 0 if at least one match, 1 otherwise.
has_old_backups() {
    # grep -q . succeeds as soon as find prints any path at all.
    if ! find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -mtime $ONE_YEAR_AGO | grep -q .; then
        echo "No old backup files found."
        return 1
    fi

    echo "Found old backup files."
    return 0
}

# Report whether "$BACKUP_DIR" holds more than 12 obdiag_backup_*.tar.gz archives.
# Side effect: the global COUNT is set to the number of archives found.
# Prints the outcome on stdout; returns 0 when over the limit, 1 otherwise.
has_too_many_backups() {
    COUNT=$(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f | wc -l)

    if ! [ $COUNT -gt 12 ]; then
        echo "Backup count within limit: $COUNT"
        return 1
    fi

    echo "More than 12 backup files found: $COUNT"
    return 0
}

# Cleanup loop: while an over-age archive exists or the archive count is over
# the limit, delete the oldest archive, one per iteration. The loop ends early
# if a removal attempt reports failure (nothing left to remove, or rm failed).
echo "Starting cleanup process..."
while has_old_backups || has_too_many_backups; do
    remove_oldest_backup && continue
    echo "Cleanup process stopped due to failure in removing oldest backup."
    break # Stop if no more files to remove or removal failed
done
echo "Cleanup process completed."
4 changes: 2 additions & 2 deletions src/common/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def gather_function(self, function_type, opt):
since=Util.get_option(options, 'since'),
grep=Util.get_option(options, 'grep'),
store_dir=Util.get_option(options, 'store_dir'),
temp_dir=Util.get_option(options, 'temp_dir'),
temp_dir="/tmp",
redact=Util.get_option(options, 'redact'),
)
return handler_obproxy.handle()
Expand Down Expand Up @@ -360,7 +360,7 @@ def gather_obproxy_log(self, opt):
scope=Util.get_option(options, 'scope'),
grep=Util.get_option(options, 'grep'),
store_dir=Util.get_option(options, 'store_dir'),
temp_dir=Util.get_option(options, 'temp_dir'),
temp_dir="/tmp",
redact=Util.get_option(options, 'redact'),
)
return handler.handle()
Expand Down
12 changes: 0 additions & 12 deletions src/common/tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -1327,18 +1327,6 @@ def replacer(match):

return re.sub(r'#\{(\w+)\}', replacer, s)

@staticmethod
def build_str_on_expr_by_dict_2(expr, variable_dict, stdio=None):
s = expr
d = variable_dict

def replacer(match):
key = match.group(1)
value = str(d.get(key, match.group(0)))
return f"{value}"

return re.sub(r'\$\{(\w+)\}', replacer, s)

@staticmethod
def build_sql_on_expr_by_dict(expr, variable_dict, stdio=None):
s = expr
Expand Down
9 changes: 6 additions & 3 deletions src/handler/analyzer/analyze_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,13 +645,16 @@ def __parse_log_lines(self, file_full_path, memory_dict):
if '[MEMORY]' in line or 'MemDump' in line or 'ob_tenant_ctx_allocator' in line:
if '[MEMORY] tenant:' in line:
tenant_id = line.split('tenant:')[1].split(',')[0].strip()
hold_bytes = line.split('hold:')[1].split('rpc_')[0].strip()
rpc_hold_bytes = line.split('rpc_hold:')[1].split('cache_hold')[0].strip()
if 'rpc_' in line:
hold_bytes = line.split('hold:')[1].split('rpc_')[0].strip()
rpc_hold_bytes = line.split('rpc_hold:')[1].split('cache_hold')[0].strip()
tenant_dict['rpc_hold'] = self.__convert_string_bytes_2_int_bytes(rpc_hold_bytes)
else:
hold_bytes = line.split('hold:')[1].split('cache_')[0].strip()
cache_hold_bytes = line.split('cache_hold:')[1].split('cache_used')[0].strip()
cache_used_bytes = line.split('cache_used:')[1].split('cache_item_count')[0].strip()
cache_item_count = line.split('cache_item_count:')[1].strip()
tenant_dict['hold'] = self.__convert_string_bytes_2_int_bytes(hold_bytes)
tenant_dict['rpc_hold'] = self.__convert_string_bytes_2_int_bytes(rpc_hold_bytes)
tenant_dict['cache_hold'] = self.__convert_string_bytes_2_int_bytes(cache_hold_bytes)
tenant_dict['cache_used'] = self.__convert_string_bytes_2_int_bytes(cache_used_bytes)
tenant_dict['cache_item_count'] = self.__convert_string_bytes_2_int_bytes(cache_item_count)
Expand Down
2 changes: 1 addition & 1 deletion src/handler/display/step/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def execute(self):
if "ssh" not in self.step:
self.stdio.error("SshHandler execute ssh is not set")
return
ssh_cmd = StringUtils.build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict)
ssh_cmd = StringUtils.build_str_on_expr_by_dict(self.step["ssh"], self.task_variable_dict)
self.stdio.verbose("step SshHandler execute :{0} ".format(ssh_cmd))
ssh_report_value = self.ssh_client.exec_cmd(ssh_cmd)
if ssh_report_value is None:
Expand Down
22 changes: 19 additions & 3 deletions src/handler/gather/gather_component_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def init(self, context, *args, **kwargs):
self.scope = "all"
self.grep = kwargs.get('grep', None)
self.store_dir = kwargs.get('store_dir', None)
self.temp_dir = kwargs.get('temp_dir', None)
self.temp_dir = kwargs.get('temp_dir', const.GATHER_LOG_TEMPORARY_DIR_DEFAULT)
self.redact = kwargs.get('redact', None)
self.nodes = kwargs.get('nodes', None)
self.is_scene = kwargs.get('is_scene', False)
Expand All @@ -103,7 +103,7 @@ def init(self, context, *args, **kwargs):
# build config dict for gather log on node
self.gather_log_conf_dict = {
"target": self.target,
"tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT,
"tmp_dir": self.temp_dir,
"scope": self.scope,
"grep": self.grep,
"store_dir": self.store_dir,
Expand Down Expand Up @@ -399,7 +399,12 @@ def handle(self, result_list=None):
self.ssh_client.exec_cmd("mkdir -p {0}".format(self.tmp_dir))
from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str))
to_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.to_time_str))
tmp_dir = "{4}_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6], self.target)

tmp_dir = "{0}_log_{1}_{2}_{3}_{4}".format(self.target, self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6])
if self.target == "observer" and self.ssh_client.get_name() == "local":
pid = self.__get_observer_pid(self.node)
if pid:
tmp_dir = "{0}_pid_{1}".format(tmp_dir, pid)
tmp_log_dir = os.path.join(self.tmp_dir, tmp_dir)
# mkdir tmp_log_dir
self.ssh_client.exec_cmd("mkdir -p {0}".format(tmp_log_dir))
Expand Down Expand Up @@ -583,3 +588,14 @@ def __get_logfile_name_list(self, from_time_str, to_time_str, log_dir, log_files
else:
self.stdio.warn("No found the qualified log file on Server [{0}]".format(self.ssh_client.get_name()))
return log_name_list

def __get_observer_pid(self, node):
    """Read the observer PID recorded under the node's home path.

    The PID is taken from the first line of ``<home_path>/run/observer.pid``,
    opened on the machine where obdiag itself runs.

    :param node: mapping describing the node; only its ``home_path`` entry is read.
    :return: the stripped first line of the pid file (an empty string if the
        file is empty), or ``None`` when ``home_path`` is not set or the file
        cannot be read.
    """
    home_path = node.get("home_path")
    if not home_path:
        # os.path.join(None, ...) raises TypeError, which previously escaped
        # this method because the path was built outside the try block.
        self.stdio.warn("No home_path is set for the node, skip reading observer.pid.")
        return None
    pid_file_path = os.path.join(home_path, 'run', 'observer.pid')
    try:
        with open(pid_file_path, 'r') as file:
            return file.readline().strip()
    except FileNotFoundError:
        self.stdio.exception(f"Error: The file {pid_file_path} does not exist.")
    except Exception as e:
        self.stdio.exception(f"An error occurred: {e}")
    # Reached only after a logged failure; callers treat a falsy value as "no pid".
    return None
2 changes: 1 addition & 1 deletion src/handler/gather/gather_obstack2.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def __handle_from_node(self, local_stored_path, node):
if getattr(sys, 'frozen', False):
absPath = os.path.dirname(sys.executable)
else:
absPath = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
absPath = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
obstack2_local_stored_full_path = os.path.join(absPath, const.OBSTACK2_LOCAL_STORED_PATH)
upload_file(ssh_client, obstack2_local_stored_full_path, const.OBSTACK2_DEFAULT_INSTALL_PATH, self.context.stdio)
self.stdio.verbose("Installation of obstack2 is completed and gather begins ...")
Expand Down
2 changes: 1 addition & 1 deletion src/handler/gather/step/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def execute(self):
if "ssh" not in self.step:
self.stdio.error("SshHandler execute ssh is not set")
return
ssh_cmd = StringUtils.build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict)
ssh_cmd = StringUtils.build_str_on_expr_by_dict(self.step["ssh"], self.task_variable_dict)
self.stdio.verbose("step SshHandler execute :{0} ".format(ssh_cmd))
ssh_report_value = self.ssh_client.exec_cmd(ssh_cmd)
if ssh_report_value is None:
Expand Down
4 changes: 2 additions & 2 deletions src/handler/rca/rca_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ def __init__(self, context):
# init input parameters
self.report = None
self.tasks = None
self.context.set_variable("input_parameters", Util.get_option(self.options, "input_parameters"))
self.context.set_variable("env", Util.get_option(self.options, "input_parameters"))
self.context.set_variable("input_parameters", Util.get_option(self.options, "env"))
self.context.set_variable("env", Util.get_option(self.options, "env"))
self.store_dir = Util.get_option(self.options, "store_dir", "./obdiag_rca/")
self.context.set_variable("store_dir", self.store_dir)
self.stdio.verbose(
Expand Down
Loading