From 366dccd187049cf539d708dfb5e4bb59f2d985ba Mon Sep 17 00:00:00 2001 From: "jingshun.tq" <35712518+Teingi@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:54:10 +0800 Subject: [PATCH 1/5] Rename Dockerfile to DockerFile --- build/{Dockerfile => DockerFile} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename build/{Dockerfile => DockerFile} (93%) diff --git a/build/Dockerfile b/build/DockerFile similarity index 93% rename from build/Dockerfile rename to build/DockerFile index 60898428..4a0d7c9f 100644 --- a/build/Dockerfile +++ b/build/DockerFile @@ -8,4 +8,4 @@ RUN /opt/miniconda3/bin/conda init RUN /opt/miniconda3/bin/conda create --name obdiag python=3.8 -y RUN source /opt/miniconda3/bin/activate obdiag RUN /opt/miniconda3/envs/obdiag/bin/python3.8 -m pip install --upgrade pip setuptools wheel -RUN yum install -y gcc gcc-c++ make \ No newline at end of file +RUN yum install -y gcc gcc-c++ make From 76de3127d13f4200a9488382a098c1e08e24cf71 Mon Sep 17 00:00:00 2001 From: Teingi Date: Wed, 25 Dec 2024 11:03:37 +0800 Subject: [PATCH 2/5] Focused efforts on addressing bugs identified in the 3.0.0 iteration --- .../observer/system/instruction_set_avx.yaml | 9 ++++++++ rpm/init.sh | 15 ++++++++++++- src/handler/gather/gather_component_log.py | 22 ++++++++++++++++--- 3 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 plugins/check/tasks/observer/system/instruction_set_avx.yaml diff --git a/plugins/check/tasks/observer/system/instruction_set_avx.yaml b/plugins/check/tasks/observer/system/instruction_set_avx.yaml new file mode 100644 index 00000000..2903caf3 --- /dev/null +++ b/plugins/check/tasks/observer/system/instruction_set_avx.yaml @@ -0,0 +1,9 @@ +info: "Check the flags of cpu" +task: + - steps: + - type: ssh + ssh: "lscpu |grep Flags" + result: + set_value: cpu_flags + verify: " [[ $cpu_flags == *avx* ]] " + err_msg: "observer need cpu support avx. If the cpu is not support avx, observer will be crash." diff --git a/rpm/init.sh b/rpm/init.sh index b6cc1dba..5520d2a0 100755 --- a/rpm/init.sh +++ b/rpm/init.sh @@ -1,5 +1,18 @@ #!/usr/bin/env bash +CURRENT_USER_ID=$(id -u) +CURRENT_USER_NAME=$(logname 2>/dev/null || echo "$SUDO_USER" | awk -F'[^a-zA-Z0-9_]' '{print $1}') + +if [ "$CURRENT_USER_ID" -eq 0 ]; then + if [ -n "$SUDO_USER" ]; then + USER_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6) + else + USER_HOME=/root + fi +else + USER_HOME="$HOME" +fi + if [[ $# == 1 && $1 == "-f" ]]; then FORCE_DEPLOY="1" else @@ -34,7 +47,7 @@ fi source ${WORK_DIR}/init_obdiag_cmd.sh cd - -output_file=${OBDIAG_HOME}/version.yaml +output_file=${USER_HOME}/version.yaml version_line=$(/usr/local/oceanbase-diagnostic-tool/obdiag --version 2>&1 | grep -oP 'OceanBase Diagnostic Tool: \K[\d.]+') if [ -n "$version_line" ]; then content="obdiag_version: \"$version_line\"" diff --git a/src/handler/gather/gather_component_log.py b/src/handler/gather/gather_component_log.py index ffa67f54..fd06d9e1 100644 --- a/src/handler/gather/gather_component_log.py +++ b/src/handler/gather/gather_component_log.py @@ -92,7 +92,7 @@ def init(self, context, *args, **kwargs): self.scope = "all" self.grep = kwargs.get('grep', None) self.store_dir = kwargs.get('store_dir', None) - self.temp_dir = kwargs.get('temp_dir', None) + self.temp_dir = kwargs.get('temp_dir', const.GATHER_LOG_TEMPORARY_DIR_DEFAULT) self.redact = kwargs.get('redact', None) self.nodes = kwargs.get('nodes', None) self.is_scene = kwargs.get('is_scene', False) @@ -103,7 +103,7 @@ def init(self, context, *args, **kwargs): # build config dict for gather log on node self.gather_log_conf_dict = { "target": self.target, - "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT, + "tmp_dir": self.temp_dir, "scope": self.scope, "grep": self.grep, "store_dir": self.store_dir, @@ -399,7 +399,12 @@ def handle(self, result_list=None): self.ssh_client.exec_cmd("mkdir -p {0}".format(self.tmp_dir)) from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str)) to_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.to_time_str)) - tmp_dir = "{4}_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6], self.target) + + tmp_dir = "{0}_log_{1}_{2}_{3}_{4}".format(self.target, self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6]) + if self.target == "observer" and self.ssh_client.get_name() == "local": + pid = self.__get_observer_pid(self.node) + if pid: + tmp_dir = "{0}_pid_{1}".format(tmp_dir, pid) tmp_log_dir = os.path.join(self.tmp_dir, tmp_dir) # mkdir tmp_log_dir self.ssh_client.exec_cmd("mkdir -p {0}".format(tmp_log_dir)) @@ -583,3 +588,14 @@ def __get_logfile_name_list(self, from_time_str, to_time_str, log_dir, log_files else: self.stdio.warn("No found the qualified log file on Server [{0}]".format(self.ssh_client.get_name())) return log_name_list + + def __get_observer_pid(self, node): + pid_file_path = os.path.join(node.get("home_path"), 'run', 'observer.pid') + try: + with open(pid_file_path, 'r') as file: + first_line = file.readline().strip() + return first_line + except FileNotFoundError: + self.stdio.exception(f"Error: The file {pid_file_path} does not exist.") + except Exception as e: + self.stdio.exception(f"An error occurred: {e}") From 23e2b62cd1ff86fba5126e7021987785715b4bae Mon Sep 17 00:00:00 2001 From: Teingi Date: Thu, 26 Dec 2024 14:20:44 +0800 Subject: [PATCH 3/5] 3.0.0 fix bug --- rpm/init.sh | 4 +- rpm/obdiag_backup.sh | 87 +++++++++++++++++++-------- src/common/core.py | 4 +- src/handler/gather/gather_obstack2.py | 2 +- 4 files changed, 66 insertions(+), 31 deletions(-) diff --git a/rpm/init.sh b/rpm/init.sh index 5520d2a0..a62286bc 100755 --- a/rpm/init.sh +++ b/rpm/init.sh @@ -24,7 +24,7 @@ WORK_DIR=$(readlink -f "$(dirname ${BASH_SOURCE[0]})") if [ ${OBDIAG_HOME} ]; then OBDIAG_HOME=${OBDIAG_HOME} else - OBDIAG_HOME="${HOME}/.obdiag" + OBDIAG_HOME="${USER_HOME}/.obdiag" fi mkdir -p ${OBDIAG_HOME} && cd ${OBDIAG_HOME} @@ -47,7 +47,7 @@ fi source ${WORK_DIR}/init_obdiag_cmd.sh cd - -output_file=${USER_HOME}/version.yaml +output_file=${OBDIAG_HOME}/version.yaml version_line=$(/usr/local/oceanbase-diagnostic-tool/obdiag --version 2>&1 | grep -oP 'OceanBase Diagnostic Tool: \K[\d.]+') if [ -n "$version_line" ]; then content="obdiag_version: \"$version_line\"" diff --git a/rpm/obdiag_backup.sh b/rpm/obdiag_backup.sh index 9320a0f0..12b72108 100755 --- a/rpm/obdiag_backup.sh +++ b/rpm/obdiag_backup.sh @@ -48,30 +48,31 @@ TIMESTAMP=$(date +"%Y%m%d%H%M%S") BASE_NAME="obdiag_backup${VERSION:+_v$VERSION}" TARFILE="$BACKUP_DIR/${BASE_NAME}_$TIMESTAMP.tar.gz" -# Check if a file with the same name already exists in the BACKUP_DIR +# Check if a file with the same base name already exists in the BACKUP_DIR if find "$BACKUP_DIR" -maxdepth 1 -name "${BASE_NAME}_*.tar.gz" -print -quit | grep -q .; then - echo "A backup file with the same name already exists. Skipping backup creation." - exit 0 + echo "A backup file with the same base name already exists. Skipping backup creation." + exit 0 fi -# Temporary directory for staging backup files +# Temporary directory for staging backup files, including top-level directory TEMP_BACKUP_DIR="$BACKUP_DIR/tmp_obdiag_backup_$TIMESTAMP" -mkdir -p "$TEMP_BACKUP_DIR" +TOP_LEVEL_DIR="$TEMP_BACKUP_DIR/obdiag_backup${VERSION:+_v$VERSION}_$TIMESTAMP" # Top-level directory inside the tarball +mkdir -p "$TOP_LEVEL_DIR" # Iterate over each directory to be backed up for dir in "${DIRS[@]}"; do # Check if the source directory exists if [ -d "$SOURCE_DIR$dir" ]; then - # Copy the directory into the temporary backup directory - cp -rp "$SOURCE_DIR$dir" "$TEMP_BACKUP_DIR/" - echo "Copied $dir to temporary backup directory." + # Copy the directory into the top-level directory within the temporary backup directory + cp -rp "$SOURCE_DIR$dir" "$TOP_LEVEL_DIR/" + echo "Copied $dir to temporary backup directory under ${BASE_NAME}_$TIMESTAMP." else echo "Source directory $SOURCE_DIR$dir does not exist. Skipping." fi done -# Create a tar.gz archive -if tar -czf "$TARFILE" -C "$TEMP_BACKUP_DIR" .; then +# Create a tar.gz archive with the top-level directory included +if tar -czf "$TARFILE" -C "$TEMP_BACKUP_DIR" "obdiag_backup${VERSION:+_v$VERSION}_$TIMESTAMP"; then echo "Backup archive created successfully at $TARFILE" else echo "Failed to create backup archive." @@ -85,19 +86,53 @@ echo "Temporary files removed." # Cleanup phase: Remove backups older than one year or delete the oldest backups if more than 12 exist ONE_YEAR_AGO="+365" # find command uses days, so +365 means older than one year -# Remove backups older than one year -find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -mtime $ONE_YEAR_AGO -exec rm -f {} \; -echo "Removed old backup files older than one year." - -# If there are more than 12 backups, remove the excess oldest ones -BACKUP_FILES=($(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -printf '%T@ %p\n' | sort -n)) -NUM_BACKUPS=${#BACKUP_FILES[@]} - -if [ $NUM_BACKUPS -gt 12 ]; then - COUNT_TO_DELETE=$((NUM_BACKUPS - 12)) - for ((i = 0; i < COUNT_TO_DELETE; i++)); do - FILE_PATH=${BACKUP_FILES[i]#* } - rm -f "$FILE_PATH" - echo "Removed excess backup file: $FILE_PATH" - done -fi \ No newline at end of file +# Function to remove a single oldest backup file and print the action +remove_oldest_backup() { + BACKUP_FILE=$(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -printf '%T+ %p\n' | sort | head -n 1 | cut -d ' ' -f2-) + if [ -n "$BACKUP_FILE" ]; then + echo "Attempting to remove oldest backup file: $BACKUP_FILE" + if rm -f "$BACKUP_FILE"; then + echo "Successfully removed oldest backup file: $BACKUP_FILE" + return 0 + else + echo "Failed to remove oldest backup file: $BACKUP_FILE" + return 1 + fi + else + echo "No backup files found." + return 1 + fi +} + +# Function to check if there are backups older than one year +has_old_backups() { + if find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f -mtime $ONE_YEAR_AGO | grep -q .; then + echo "Found old backup files." + return 0 + else + echo "No old backup files found." + return 1 + fi +} + +# Function to check if there are more than 12 backups +has_too_many_backups() { + COUNT=$(find "$BACKUP_DIR" -maxdepth 1 -name "obdiag_backup_*.tar.gz" -type f | wc -l) + if [ $COUNT -gt 12 ]; then + echo "More than 12 backup files found: $COUNT" + return 0 + else + echo "Backup count within limit: $COUNT" + return 1 + fi +} + +# Cleanup loop: Remove only one file at a time until neither condition is met +echo "Starting cleanup process..." +while has_old_backups || has_too_many_backups; do + if ! remove_oldest_backup; then + echo "Cleanup process stopped due to failure in removing oldest backup." + break # Stop if no more files to remove or removal failed + fi +done +echo "Cleanup process completed." \ No newline at end of file diff --git a/src/common/core.py b/src/common/core.py index 52c43c98..6604d006 100644 --- a/src/common/core.py +++ b/src/common/core.py @@ -316,7 +316,7 @@ def gather_function(self, function_type, opt): since=Util.get_option(options, 'since'), grep=Util.get_option(options, 'grep'), store_dir=Util.get_option(options, 'store_dir'), - temp_dir=Util.get_option(options, 'temp_dir'), + temp_dir="/tmp", redact=Util.get_option(options, 'redact'), ) return handler_obproxy.handle() @@ -360,7 +360,7 @@ def gather_obproxy_log(self, opt): scope=Util.get_option(options, 'scope'), grep=Util.get_option(options, 'grep'), store_dir=Util.get_option(options, 'store_dir'), - temp_dir=Util.get_option(options, 'temp_dir'), + temp_dir="/tmp", redact=Util.get_option(options, 'redact'), ) return handler.handle() diff --git a/src/handler/gather/gather_obstack2.py b/src/handler/gather/gather_obstack2.py index a43cbdff..29315bd0 100644 --- a/src/handler/gather/gather_obstack2.py +++ b/src/handler/gather/gather_obstack2.py @@ -142,7 +142,7 @@ def __handle_from_node(self, local_stored_path, node): if getattr(sys, 'frozen', False): absPath = os.path.dirname(sys.executable) else: - absPath = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + absPath = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) obstack2_local_stored_full_path = os.path.join(absPath, const.OBSTACK2_LOCAL_STORED_PATH) upload_file(ssh_client, obstack2_local_stored_full_path, const.OBSTACK2_DEFAULT_INSTALL_PATH, self.context.stdio) self.stdio.verbose("Installation of obstack2 is completed and gather begins ...") From 4ccfdd4f6be95224c111e0e6dec93523a3d7a908 Mon Sep 17 00:00:00 2001 From: Teingi Date: Mon, 30 Dec 2024 00:00:49 +0800 Subject: [PATCH 4/5] fix --- src/handler/analyzer/analyze_memory.py | 9 ++++++--- src/handler/rca/rca_handler.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/handler/analyzer/analyze_memory.py b/src/handler/analyzer/analyze_memory.py index 363795f4..08afc514 100644 --- a/src/handler/analyzer/analyze_memory.py +++ b/src/handler/analyzer/analyze_memory.py @@ -645,13 +645,16 @@ def __parse_log_lines(self, file_full_path, memory_dict): if '[MEMORY]' in line or 'MemDump' in line or 'ob_tenant_ctx_allocator' in line: if '[MEMORY] tenant:' in line: tenant_id = line.split('tenant:')[1].split(',')[0].strip() - hold_bytes = line.split('hold:')[1].split('rpc_')[0].strip() - rpc_hold_bytes = line.split('rpc_hold:')[1].split('cache_hold')[0].strip() + if 'rpc_' in line: + hold_bytes = line.split('hold:')[1].split('rpc_')[0].strip() + rpc_hold_bytes = line.split('rpc_hold:')[1].split('cache_hold')[0].strip() + tenant_dict['rpc_hold'] = self.__convert_string_bytes_2_int_bytes(rpc_hold_bytes) + else: + hold_bytes = line.split('hold:')[1].split('cache_')[0].strip() cache_hold_bytes = line.split('cache_hold:')[1].split('cache_used')[0].strip() cache_used_bytes = line.split('cache_used:')[1].split('cache_item_count')[0].strip() cache_item_count = line.split('cache_item_count:')[1].strip() tenant_dict['hold'] = self.__convert_string_bytes_2_int_bytes(hold_bytes) - tenant_dict['rpc_hold'] = self.__convert_string_bytes_2_int_bytes(rpc_hold_bytes) tenant_dict['cache_hold'] = self.__convert_string_bytes_2_int_bytes(cache_hold_bytes) tenant_dict['cache_used'] = self.__convert_string_bytes_2_int_bytes(cache_used_bytes) tenant_dict['cache_item_count'] = self.__convert_string_bytes_2_int_bytes(cache_item_count) diff --git a/src/handler/rca/rca_handler.py b/src/handler/rca/rca_handler.py index 240cf581..beba179f 100644 --- a/src/handler/rca/rca_handler.py +++ b/src/handler/rca/rca_handler.py @@ -137,8 +137,8 @@ def __init__(self, context): # init input parameters self.report = None self.tasks = None - self.context.set_variable("input_parameters", Util.get_option(self.options, "input_parameters")) - self.context.set_variable("env", Util.get_option(self.options, "input_parameters")) + self.context.set_variable("input_parameters", Util.get_option(self.options, "env")) + self.context.set_variable("env", Util.get_option(self.options, "env")) self.store_dir = Util.get_option(self.options, "store_dir", "./obdiag_rca/") self.context.set_variable("store_dir", self.store_dir) self.stdio.verbose( From fdfff13ada56667e4fd2714f4c80d10eebaa4f9f Mon Sep 17 00:00:00 2001 From: Teingi Date: Tue, 31 Dec 2024 10:29:44 +0800 Subject: [PATCH 5/5] fix --- src/common/tool.py | 12 ------------ src/handler/display/step/ssh.py | 2 +- src/handler/gather/step/ssh.py | 2 +- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/common/tool.py b/src/common/tool.py index beb0f69f..f4a6ea77 100644 --- a/src/common/tool.py +++ b/src/common/tool.py @@ -1327,18 +1327,6 @@ def replacer(match): return re.sub(r'#\{(\w+)\}', replacer, s) - @staticmethod - def build_str_on_expr_by_dict_2(expr, variable_dict, stdio=None): - s = expr - d = variable_dict - - def replacer(match): - key = match.group(1) - value = str(d.get(key, match.group(0))) - return f"{value}" - - return re.sub(r'\$\{(\w+)\}', replacer, s) - @staticmethod def build_sql_on_expr_by_dict(expr, variable_dict, stdio=None): s = expr diff --git a/src/handler/display/step/ssh.py b/src/handler/display/step/ssh.py index 4dcc377a..b63a5c48 100644 --- a/src/handler/display/step/ssh.py +++ b/src/handler/display/step/ssh.py @@ -44,7 +44,7 @@ def execute(self): if "ssh" not in self.step: self.stdio.error("SshHandler execute ssh is not set") return - ssh_cmd = StringUtils.build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict) + ssh_cmd = StringUtils.build_str_on_expr_by_dict(self.step["ssh"], self.task_variable_dict) self.stdio.verbose("step SshHandler execute :{0} ".format(ssh_cmd)) ssh_report_value = self.ssh_client.exec_cmd(ssh_cmd) if ssh_report_value is None: diff --git a/src/handler/gather/step/ssh.py b/src/handler/gather/step/ssh.py index 71655262..1d7aaf8c 100644 --- a/src/handler/gather/step/ssh.py +++ b/src/handler/gather/step/ssh.py @@ -44,7 +44,7 @@ def execute(self): if "ssh" not in self.step: self.stdio.error("SshHandler execute ssh is not set") return - ssh_cmd = StringUtils.build_str_on_expr_by_dict_2(self.step["ssh"], self.task_variable_dict) + ssh_cmd = StringUtils.build_str_on_expr_by_dict(self.step["ssh"], self.task_variable_dict) self.stdio.verbose("step SshHandler execute :{0} ".format(ssh_cmd)) ssh_report_value = self.ssh_client.exec_cmd(ssh_cmd) if ssh_report_value is None: