Skip to content

Commit

Permalink
Focused efforts on addressing bugs identified in the obdiag 3.0.0 ite…
Browse files Browse the repository at this point in the history
…ration (#653)

* Rename Dockerfile to DockerFile

* Focused efforts on addressing bugs identified in the 3.0.0 iteration

* 3.0.0 fix bug

* fix

* fix
  • Loading branch information
Teingi authored Dec 31, 2024
1 parent 5f8cc82 commit abff0e4
Show file tree
Hide file tree
Showing 11 changed files with 116 additions and 52 deletions.
9 changes: 9 additions & 0 deletions plugins/check/tasks/observer/system/instruction_set_avx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Check task: confirm that the CPU flags reported by lscpu include AVX.
info: "Check the flags of cpu"
task:
- steps:
# Step 1: run lscpu and keep only the line(s) containing "Flags".
# NOTE(review): presumably executed on the observer node over ssh - confirm.
- type: ssh
ssh: "lscpu |grep Flags"
result:
# The step output is bound to the name cpu_flags, which verify references below.
set_value: cpu_flags
# Bash-style pattern test: succeeds when cpu_flags contains the substring "avx"
# anywhere (so longer flag names that contain "avx" also satisfy it).
verify: " [[ $cpu_flags == *avx* ]] "
# NOTE(review): presumably reported when verify fails - confirm against the check runner.
err_msg: "observer need cpu support avx. If the cpu is not support avx, observer will be crash."
15 changes: 14 additions & 1 deletion rpm/init.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
#!/usr/bin/env bash

# Work out who is running the installer and which home directory to use.
# When the script runs as root via sudo, the invoking user's home is used
# rather than root's.
CURRENT_USER_ID=$(id -u)
# Prefer the login name. Only if logname fails is $SUDO_USER used, and only
# that fallback is passed through awk, which keeps the text before the first
# character outside [a-zA-Z0-9_].
CURRENT_USER_NAME=$(logname 2>/dev/null || echo "$SUDO_USER" | awk -F'[^a-zA-Z0-9_]' '{print $1}')

USER_HOME=""
if [ "$CURRENT_USER_ID" -eq 0 ]; then
  if [ -n "$SUDO_USER" ]; then
    # Field 6 of the passwd entry is the home directory.
    USER_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6)
  fi
  # Plain root login, or the passwd lookup produced nothing (unknown user,
  # getent unavailable): fall back to root's home instead of leaving
  # USER_HOME empty, which would turn "${USER_HOME}/.obdiag" into "/.obdiag".
  if [ -z "$USER_HOME" ]; then
    USER_HOME=/root
  fi
else
  USER_HOME="$HOME"
fi

if [[ $# == 1 && $1 == "-f" ]]; then
FORCE_DEPLOY="1"
else
Expand All @@ -11,7 +24,7 @@ WORK_DIR=$(readlink -f "$(dirname ${BASH_SOURCE[0]})")
if [ ${OBDIAG_HOME} ]; then
OBDIAG_HOME=${OBDIAG_HOME}
else
OBDIAG_HOME="${HOME}/.obdiag"
OBDIAG_HOME="${USER_HOME}/.obdiag"
fi

mkdir -p ${OBDIAG_HOME} && cd ${OBDIAG_HOME}
Expand Down
87 changes: 61 additions & 26 deletions rpm/obdiag_backup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,30 +48,31 @@ TIMESTAMP=$(date +"%Y%m%d%H%M%S")
BASE_NAME="obdiag_backup${VERSION:+_v$VERSION}"
TARFILE="$BACKUP_DIR/${BASE_NAME}_$TIMESTAMP.tar.gz"

# Check if a file with the same name already exists in the BACKUP_DIR
# Check if a file with the same base name already exists in the BACKUP_DIR
if find "$BACKUP_DIR" -maxdepth 1 -name "${BASE_NAME}_*.tar.gz" -print -quit | grep -q .; then
echo "A backup file with the same name already exists. Skipping backup creation."
exit 0
echo "A backup file with the same base name already exists. Skipping backup creation."
exit 0
fi

# Temporary directory for staging backup files
# Temporary directory for staging backup files, including top-level directory
TEMP_BACKUP_DIR="$BACKUP_DIR/tmp_obdiag_backup_$TIMESTAMP"
mkdir -p "$TEMP_BACKUP_DIR"
TOP_LEVEL_DIR="$TEMP_BACKUP_DIR/obdiag_backup${VERSION:+_v$VERSION}_$TIMESTAMP" # Top-level directory inside the tarball
mkdir -p "$TOP_LEVEL_DIR"

# Iterate over each directory to be backed up
for dir in "${DIRS[@]}"; do
# Check if the source directory exists
if [ -d "$SOURCE_DIR$dir" ]; then
# Copy the directory into the temporary backup directory
cp -rp "$SOURCE_DIR$dir" "$TEMP_BACKUP_DIR/"
echo "Copied $dir to temporary backup directory."
# Copy the directory into the top-level directory within the temporary backup directory
cp -rp "$SOURCE_DIR$dir" "$TOP_LEVEL_DIR/"
echo "Copied $dir to temporary backup directory under ${BASE_NAME}_$TIMESTAMP."
else
echo "Source directory $SOURCE_DIR$dir does not exist. Skipping."
fi
done

# Create a tar.gz archive
if tar -czf "$TARFILE" -C "$TEMP_BACKUP_DIR" .; then
# Create a tar.gz archive with the top-level directory included
if tar -czf "$TARFILE" -C "$TEMP_BACKUP_DIR" "obdiag_backup${VERSION:+_v$VERSION}_$TIMESTAMP"; then
echo "Backup archive created successfully at $TARFILE"
else
echo "Failed to create backup archive."
Expand All @@ -85,19 +86,53 @@ echo "Temporary files removed."
# Cleanup phase: Remove backups older than one year or delete the oldest backups if more than 12 exist
ONE_YEAR_AGO="+365" # find command uses days, so +365 means older than one year

# Remove backups older than one year
find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -mtime $ONE_YEAR_AGO -exec rm -f {} \;
echo "Removed old backup files older than one year."

# If there are more than 12 backups, remove the excess oldest ones
BACKUP_FILES=($(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -printf '%T@ %p\n' | sort -n))
NUM_BACKUPS=${#BACKUP_FILES[@]}

if [ $NUM_BACKUPS -gt 12 ]; then
COUNT_TO_DELETE=$((NUM_BACKUPS - 12))
for ((i = 0; i < COUNT_TO_DELETE; i++)); do
FILE_PATH=${BACKUP_FILES[i]#* }
rm -f "$FILE_PATH"
echo "Removed excess backup file: $FILE_PATH"
done
fi
# Remove the single oldest backup archive in BACKUP_DIR and report the action.
# Globals:
#   BACKUP_DIR (read) - directory searched, top level only.
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 if an archive was removed; 1 if none was found or rm failed.
remove_oldest_backup() {
  # Kept local so the function does not clobber a caller's variable.
  local oldest
  # '%T+' prints the modification time as YYYY-MM-DD+HH:MM:SS.fraction, which
  # sorts chronologically as plain text; LC_ALL=C keeps the sort byte-wise.
  # cut -f2- keeps the whole path even when it contains spaces.
  oldest=$(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -printf '%T+ %p\n' \
    | LC_ALL=C sort | head -n 1 | cut -d ' ' -f2-)

  if [ -z "$oldest" ]; then
    echo "No backup files found."
    return 1
  fi

  echo "Attempting to remove oldest backup file: $oldest"
  # '--' stops option parsing in case the path starts with a dash.
  if rm -f -- "$oldest"; then
    echo "Successfully removed oldest backup file: $oldest"
    return 0
  fi

  echo "Failed to remove oldest backup file: $oldest"
  return 1
}

# Report whether BACKUP_DIR contains a backup archive past the age threshold.
# Globals:
#   BACKUP_DIR   (read) - directory searched, top level only.
#   ONE_YEAR_AGO (read) - argument for find's -mtime test; "+365" is used
#                         when it is unset or empty.
# Outputs:
#   One status line on stdout.
# Returns:
#   0 if at least one such archive exists, 1 otherwise.
has_old_backups() {
  local age_spec=${ONE_YEAR_AGO:-+365}
  local first_match
  # Only existence matters, so stop at the first hit (-quit).
  first_match=$(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f \
    -mtime "$age_spec" -print -quit)

  if [ -n "$first_match" ]; then
    echo "Found old backup files."
    return 0
  fi

  echo "No old backup files found."
  return 1
}

# Report whether BACKUP_DIR holds more backup archives than allowed.
# Globals:
#   BACKUP_DIR (read) - directory searched, top level only.
# Arguments:
#   $1 - optional maximum number of archives to keep (default: 12).
# Outputs:
#   One status line on stdout, including the current count.
# Returns:
#   0 if the count exceeds the limit, 1 otherwise.
has_too_many_backups() {
  local max_backups=${1:-12}
  local count
  # Emit one byte per matching file and count bytes, so a file name that
  # contains a newline is still counted exactly once.
  count=$(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -printf '.' | wc -c)
  # Normalise any padding around the number printed by wc.
  count=$((count + 0))

  if [ "$count" -gt "$max_backups" ]; then
    echo "More than $max_backups backup files found: $count"
    return 0
  fi

  echo "Backup count within limit: $count"
  return 1
}

# Cleanup loop: delete one archive per pass for as long as an archive is older
# than one year or there are more than 12 archives.
echo "Starting cleanup process..."
while true; do
  # Done once neither condition holds. The count check only runs when the
  # age check found nothing.
  if ! has_old_backups && ! has_too_many_backups; then
    break
  fi
  if remove_oldest_backup; then
    continue
  fi
  # Nothing left to remove, or the removal itself failed: stop.
  echo "Cleanup process stopped due to failure in removing oldest backup."
  break
done
echo "Cleanup process completed."
4 changes: 2 additions & 2 deletions src/common/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def gather_function(self, function_type, opt):
since=Util.get_option(options, 'since'),
grep=Util.get_option(options, 'grep'),
store_dir=Util.get_option(options, 'store_dir'),
temp_dir=Util.get_option(options, 'temp_dir'),
temp_dir="/tmp",
redact=Util.get_option(options, 'redact'),
)
return handler_obproxy.handle()
Expand Down Expand Up @@ -360,7 +360,7 @@ def gather_obproxy_log(self, opt):
scope=Util.get_option(options, 'scope'),
grep=Util.get_option(options, 'grep'),
store_dir=Util.get_option(options, 'store_dir'),
temp_dir=Util.get_option(options, 'temp_dir'),
temp_dir="/tmp",
redact=Util.get_option(options, 'redact'),
)
return handler.handle()
Expand Down
12 changes: 0 additions & 12 deletions src/common/tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -1327,18 +1327,6 @@ def replacer(match):

return re.sub(r'#\{(\w+)\}', replacer, s)

@staticmethod
def build_str_on_expr_by_dict_2(expr, variable_dict, stdio=None):
s = expr
d = variable_dict

def replacer(match):
key = match.group(1)
value = str(d.get(key, match.group(0)))
return f"{value}"

return re.sub(r'\$\{(\w+)\}', replacer, s)

@staticmethod
def build_sql_on_expr_by_dict(expr, variable_dict, stdio=None):
s = expr
Expand Down
9 changes: 6 additions & 3 deletions src/handler/analyzer/analyze_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,13 +645,16 @@ def __parse_log_lines(self, file_full_path, memory_dict):
if '[MEMORY]' in line or 'MemDump' in line or 'ob_tenant_ctx_allocator' in line:
if '[MEMORY] tenant:' in line:
tenant_id = line.split('tenant:')[1].split(',')[0].strip()
hold_bytes = line.split('hold:')[1].split('rpc_')[0].strip()
rpc_hold_bytes = line.split('rpc_hold:')[1].split('cache_hold')[0].strip()
if 'rpc_' in line:
hold_bytes = line.split('hold:')[1].split('rpc_')[0].strip()
rpc_hold_bytes = line.split('rpc_hold:')[1].split('cache_hold')[0].strip()
tenant_dict['rpc_hold'] = self.__convert_string_bytes_2_int_bytes(rpc_hold_bytes)
else:
hold_bytes = line.split('hold:')[1].split('cache_')[0].strip()
cache_hold_bytes = line.split('cache_hold:')[1].split('cache_used')[0].strip()
cache_used_bytes = line.split('cache_used:')[1].split('cache_item_count')[0].strip()
cache_item_count = line.split('cache_item_count:')[1].strip()
tenant_dict['hold'] = self.__convert_string_bytes_2_int_bytes(hold_bytes)
tenant_dict['rpc_hold'] = self.__convert_string_bytes_2_int_bytes(rpc_hold_bytes)
tenant_dict['cache_hold'] = self.__convert_string_bytes_2_int_bytes(cache_hold_bytes)
tenant_dict['cache_used'] = self.__convert_string_bytes_2_int_bytes(cache_used_bytes)
tenant_dict['cache_item_count'] = self.__convert_string_bytes_2_int_bytes(cache_item_count)
Expand Down
2 changes: 1 addition & 1 deletion src/handler/display/step/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def execute(self):
if "ssh" not in self.step:
self.stdio.error("SshHandler execute ssh is not set")
return
ssh_cmd = StringUtils.build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict)
ssh_cmd = StringUtils.build_str_on_expr_by_dict(self.step["ssh"], self.task_variable_dict)
self.stdio.verbose("step SshHandler execute :{0} ".format(ssh_cmd))
ssh_report_value = self.ssh_client.exec_cmd(ssh_cmd)
if ssh_report_value is None:
Expand Down
22 changes: 19 additions & 3 deletions src/handler/gather/gather_component_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def init(self, context, *args, **kwargs):
self.scope = "all"
self.grep = kwargs.get('grep', None)
self.store_dir = kwargs.get('store_dir', None)
self.temp_dir = kwargs.get('temp_dir', None)
self.temp_dir = kwargs.get('temp_dir', const.GATHER_LOG_TEMPORARY_DIR_DEFAULT)
self.redact = kwargs.get('redact', None)
self.nodes = kwargs.get('nodes', None)
self.is_scene = kwargs.get('is_scene', False)
Expand All @@ -103,7 +103,7 @@ def init(self, context, *args, **kwargs):
# build config dict for gather log on node
self.gather_log_conf_dict = {
"target": self.target,
"tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT,
"tmp_dir": self.temp_dir,
"scope": self.scope,
"grep": self.grep,
"store_dir": self.store_dir,
Expand Down Expand Up @@ -399,7 +399,12 @@ def handle(self, result_list=None):
self.ssh_client.exec_cmd("mkdir -p {0}".format(self.tmp_dir))
from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str))
to_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.to_time_str))
tmp_dir = "{4}_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6], self.target)

tmp_dir = "{0}_log_{1}_{2}_{3}_{4}".format(self.target, self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6])
if self.target == "observer" and self.ssh_client.get_name() == "local":
pid = self.__get_observer_pid(self.node)
if pid:
tmp_dir = "{0}_pid_{1}".format(tmp_dir, pid)
tmp_log_dir = os.path.join(self.tmp_dir, tmp_dir)
# mkdir tmp_log_dir
self.ssh_client.exec_cmd("mkdir -p {0}".format(tmp_log_dir))
Expand Down Expand Up @@ -583,3 +588,14 @@ def __get_logfile_name_list(self, from_time_str, to_time_str, log_dir, log_files
else:
self.stdio.warn("No found the qualified log file on Server [{0}]".format(self.ssh_client.get_name()))
return log_name_list

def __get_observer_pid(self, node):
    """Read the observer PID recorded under the node's home path.

    Args:
        node: node configuration; only its "home_path" entry is read.

    Returns:
        The stripped first line of <home_path>/run/observer.pid, or None
        when home_path is not configured or the file cannot be read.
    """
    home_path = node.get("home_path") if node else None
    if not home_path:
        # Without this guard os.path.join(None, ...) raises TypeError before
        # any of the handlers below can catch it.
        self.stdio.warn("Cannot locate observer.pid: node has no home_path")
        return None
    pid_file_path = os.path.join(home_path, 'run', 'observer.pid')
    try:
        with open(pid_file_path, 'r') as file:
            first_line = file.readline().strip()
            return first_line
    except FileNotFoundError:
        self.stdio.exception(f"Error: The file {pid_file_path} does not exist.")
    except Exception as e:
        self.stdio.exception(f"An error occurred: {e}")
    # Every failure path ends here; callers treat a falsy result as "no PID".
    return None
2 changes: 1 addition & 1 deletion src/handler/gather/gather_obstack2.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def __handle_from_node(self, local_stored_path, node):
if getattr(sys, 'frozen', False):
absPath = os.path.dirname(sys.executable)
else:
absPath = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
absPath = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
obstack2_local_stored_full_path = os.path.join(absPath, const.OBSTACK2_LOCAL_STORED_PATH)
upload_file(ssh_client, obstack2_local_stored_full_path, const.OBSTACK2_DEFAULT_INSTALL_PATH, self.context.stdio)
self.stdio.verbose("Installation of obstack2 is completed and gather begins ...")
Expand Down
2 changes: 1 addition & 1 deletion src/handler/gather/step/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def execute(self):
if "ssh" not in self.step:
self.stdio.error("SshHandler execute ssh is not set")
return
ssh_cmd = StringUtils.build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict)
ssh_cmd = StringUtils.build_str_on_expr_by_dict(self.step["ssh"], self.task_variable_dict)
self.stdio.verbose("step SshHandler execute :{0} ".format(ssh_cmd))
ssh_report_value = self.ssh_client.exec_cmd(ssh_cmd)
if ssh_report_value is None:
Expand Down
4 changes: 2 additions & 2 deletions src/handler/rca/rca_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ def __init__(self, context):
# init input parameters
self.report = None
self.tasks = None
self.context.set_variable("input_parameters", Util.get_option(self.options, "input_parameters"))
self.context.set_variable("env", Util.get_option(self.options, "input_parameters"))
self.context.set_variable("input_parameters", Util.get_option(self.options, "env"))
self.context.set_variable("env", Util.get_option(self.options, "env"))
self.store_dir = Util.get_option(self.options, "store_dir", "./obdiag_rca/")
self.context.set_variable("store_dir", self.store_dir)
self.stdio.verbose(
Expand Down

0 comments on commit abff0e4

Please sign in to comment.