Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[13.4-stable] Enhance memory-monitor with improved logging, cleanup, and task handling #4365

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/memory-monitor/src/monitor/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#define HANDLER_SCRIPT "memory-monitor-handler.sh"

// The paths here should correspond to the paths in src/monitor/memory-monitor-handler.sh
#define LOG_DIR "output"
#define EVENT_LOG_FILE "events.log"
#define HANDLER_LOG_FILE "memory-monitor-handler.log"
Expand Down
128 changes: 68 additions & 60 deletions pkg/memory-monitor/src/monitor/memory-monitor-handler.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,73 @@
set -x
set -e

# The file names here should correspond to the ones defined in src/monitor/config.h
MEMORY_MONITOR_HANDLER_LOG_FILE="memory-monitor-handler.log"
EVENT_LOG_FILE="events.log"
PSI_FILE="psi.txt"

# Upper bound for the data kept in the output directory: 100 MB.
# cleanup() compares the directory size (in KB) against MAX_OUTPUT_SIZE_KB.
MAX_OUTPUT_SIZE_MB=100
MAX_OUTPUT_SIZE_KB=$((MAX_OUTPUT_SIZE_MB * 1024))

# Archive every output directory found in the current working directory,
# except the one that belongs to the current run.
#
# Globals:   timestamp (read) - name of the current run's output directory
# Arguments: none
# Outputs:   creates <dir>.tar.gz next to each archived directory and removes
#            the directory itself
#
# It's necessary as one output directory takes around 15 MB. In archive, it's
# compressed to 1-2 MB. The latest output is kept as is for easy access.
tar_old_output() {
  for dir in */; do
    # When there is no subdirectory at all the glob is not expanded and the
    # loop would run once with the literal "*/": skip anything that is not a
    # real directory so no bogus "*.tar.gz" archive is produced.
    [ -d "$dir" ] || continue
    if [ "$dir" != "$timestamp/" ]; then
      # Remove / from the end of the directory name
      tar_name=${dir%/}
      # NOTE(review): the file list is NUL-separated (-print0) but tar is not
      # told so; GNU tar is documented to detect this on its own - confirm that
      # the tar shipped in the image behaves the same way.
      find "$dir" -type f -print0 | tar -czf "$tar_name.tar.gz" --files-from=-
      rm -rf "$dir"
    fi
  done
}

# Print the size (in KB) of the current directory that counts against the
# retention limit: the total disk usage minus the handler log file and minus
# psi.txt (its size is regulated by the PSICollector). Files that do not exist
# are simply not subtracted.
#
# Globals: MEMORY_MONITOR_HANDLER_LOG_FILE, PSI_FILE (read)
# Outputs: the size in KB on stdout
_managed_output_size_kb() {
  size_kb=$(du -sk | awk '{print $1}')
  for excluded in "$MEMORY_MONITOR_HANDLER_LOG_FILE" "$PSI_FILE"; do
    if [ -f "$excluded" ]; then
      # stat reports bytes, convert to KB
      size_kb=$((size_kb - $(stat -c %s "$excluded") / 1024))
    fi
  done
  echo "$size_kb"
}

# Exit handler: removes the temporary PID lists, archives the output of the
# previous runs and prunes the oldest archives until the output directory fits
# into the size limit.
#
# Globals:   sorted_eve_processes, sorted_pillar_processes (read, may be unset
#            when the script fails early), EVENT_LOG_FILE, MAX_OUTPUT_SIZE_KB
# Arguments: none
# Returns:   non-zero if the output directory cannot be entered
cleanup() {
  # Disable the script debug messages, so that the caller of the script can print the
  # last lines of the log file that most likely contain the error message
  set +x

  # Clean up the temporary files. The trap may fire before they were created,
  # so tolerate unset names and already missing files instead of failing.
  for tmp_file in "${sorted_eve_processes:-}" "${sorted_pillar_processes:-}"; do
    if [ -n "$tmp_file" ]; then
      rm -f "$tmp_file"
    fi
  done

  cd output || return
  tar_old_output

  # Remove old archives until the size limit is respected
  total_size=$(_managed_output_size_kb)
  while [ "$total_size" -gt "$MAX_OUTPUT_SIZE_KB" ]; do
    oldest_archive=$(find . -type f -name "*.tar.gz" -print | sort -n | head -n 1)
    if [ -z "$oldest_archive" ]; then
      break
    fi
    # Stop instead of spinning forever if the archive cannot be removed
    rm "$oldest_archive" || break
    # Remove the first line from the events.log file: it contains the oldest event info
    if [ -f "$EVENT_LOG_FILE" ]; then
      sed -i '1d' "$EVENT_LOG_FILE"
    fi
    # Recompute with the same exclusions as the initial measurement
    total_size=$(_managed_output_size_kb)
  done
}

# Trap the cleanup function
trap cleanup EXIT

# Define the function to recursively process each cgroup
find_pids_of_cgroup() {
path=$1
tempfile=$2

# Get a copy of the list of tasks in the cgroup, not to block the cgroup while handling
tmp_tasks=$(mktemp)
cat "$path"/tasks > "$tmp_tasks"

# List all tasks, filter out unique PIDs
while read -r tid; do
if [ -z "$tid" ]; then
continue
fi
# Find the main PID for each TID
pid=$(awk '/^Tgid:/ {print $2}' "/proc/$tid/status" 2>/dev/null)
if [ -n "$pid" ]; then
# Get the PIDs of the cgroup
pids=$(cat "$path"/cgroup.procs)
for pid in $pids; do
echo "$pid" >> "$tempfile"
fi
done < "$tmp_tasks"

rm "$tmp_tasks"
done

# Recurse into subdirectories
for subdir in "$path"/*/; do
Expand Down Expand Up @@ -125,6 +170,12 @@ pillar_processes=$(mktemp)
sorted_eve_processes=$(mktemp)
sorted_pillar_processes=$(mktemp)

# Create the output directory if necessary
current_output_dir=$1
# Get the timestamp from the directory name (it's the last part of the path)
timestamp=$(basename "$current_output_dir")
mkdir -p "$current_output_dir"

# Process the cgroup and its subgroups
find_pids_of_cgroup "$cgroup_eve" "$eve_processes"
normalize_pids "$eve_processes" "$sorted_eve_processes"
Expand All @@ -138,12 +189,6 @@ rm "$pillar_processes"
# TODO How to deal with the older eve versions that do not support the debug command?
eve http-debug

# Create the output directory if necessary
current_output_dir=$1
# Get the timestamp from the directory name (it's the last part of the path)
timestamp=$(basename "$current_output_dir")
mkdir -p "$current_output_dir"

# ==== Handle the Pillar memory usage ====

show_pid_mem_usage "eve/services/pillar" "$sorted_pillar_processes" "$current_output_dir/memstat_pillar.out"
Expand All @@ -167,41 +212,4 @@ eve http-debug stop

ln -s /containers/services/pillar/rootfs/opt/zededa/bin/zedbox "$current_output_dir/zedbox"

# Clean up the temporary file
rm "$sorted_eve_processes"
rm "$sorted_pillar_processes"

# Tar directory with previous output to save space, but keep the latest output as is for easy access
# It's necessary as one output directory takes around 15 MB. In archive, it's compressed to 1-2 MB.
cd output || exit
for dir in */; do
if [ "$dir" != "$timestamp/" ]; then
#Remove / from the end of the directory name
tar_name=${dir%/}
find "$dir" -type f -print0 | tar -czf "$tar_name.tar.gz" --files-from=-
rm -rf "$dir"
fi
done

MEMORY_MONITOR_HANDLER_LOG_FILE="memory-monitor-handler.log"

# Remove old archives, do not keep more than 100 MB of archives
total_size=$(du -s | awk '{print $1}') # Size in KB
# Subtract the size of the handler log file
total_size=$((total_size - $(stat -c %s "$MEMORY_MONITOR_HANDLER_LOG_FILE") / 1024))
# Subtract the size of the psi.txt file (if it exists) as its size is regulated by the PSICollector
if [ -f psi.txt ]; then
total_size=$((total_size - $(stat -c %s psi.txt) / 1024))
fi
while [ "$total_size" -gt 102400 ]; do
found_archives=$(find . -type f -name "*.tar.gz" -print | sort -n)
if [ -z "$found_archives" ]; then
break
fi
oldest_archive=$(echo "$found_archives" | head -n 1)
rm "$oldest_archive"
# Remove the first line from the events.log file: it contains the oldest event info
sed -i '1d' events.log
total_size=$(du -s | awk '{print $1}')
total_size=$((total_size - $(stat -c %s "$MEMORY_MONITOR_HANDLER_LOG_FILE") / 1024))
done
# cleanup function will be called here automatically
9 changes: 9 additions & 0 deletions pkg/memory-monitor/src/monitor/monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,15 @@ int run_handler(const char *script_name, const char *event_msg) {
pthread_mutex_unlock(&handler_mutex);
return 1;
}
} else {
// If the status is not 0, print several lines from the end of the log file
char *tail = get_tail(LOG_DIR "/" HANDLER_LOG_FILE, 10);
if (tail != NULL) {
syslog(LOG_ERR, "Handler script output (last 10 lines):\n%s\n", tail);
free(tail);
} else {
syslog(LOG_ERR, "Failed to read the handler log file\n");
}
}
} else {
syslog(LOG_INFO, "Handler script exited abnormally by signal %d\n", WTERMSIG(status));
Expand Down
63 changes: 63 additions & 0 deletions pkg/memory-monitor/src/monitor/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,66 @@ int convert_mb_to_bytes_signed(long mb, long *bytes_out) {
}
return 0;
}

/**
 * Return the last `num_lines` lines of a text file.
 *
 * A newline that is the very last byte of the file is treated as the
 * terminator of the last line, not as the start of an additional empty line,
 * so files with and without a trailing newline give the same number of lines.
 * If the file has fewer than `num_lines` lines the whole file is returned; if
 * `num_lines` is 0 an empty string is returned.
 *
 * @param file_path path of the file to read
 * @param num_lines number of lines to return, counted from the end of the file
 * @return a newly allocated, NUL-terminated string that the caller must
 *         free(), or NULL on failure (the reason is reported via syslog)
 */
char *get_tail(const char *file_path, size_t num_lines) {
    // Open the file
    FILE *file = fopen(file_path, "r");
    if (file == NULL) {
        syslog(LOG_ERR, "Failed to open file: %s", strerror(errno));
        return NULL;
    }

    // Seek to the end of the file to learn its size
    if (fseek(file, 0, SEEK_END) != 0) {
        syslog(LOG_ERR, "Failed to get file size: %s", strerror(errno));
        fclose(file);
        return NULL;
    }
    long file_size = ftell(file);
    if (file_size == -1) {
        syslog(LOG_ERR, "Failed to get file size: %s", strerror(errno));
        fclose(file);
        return NULL;
    }

    // Offset of the first byte to return. It stays 0 (whole file) when the
    // file holds fewer than `num_lines` lines.
    long start = 0;
    if (num_lines == 0) {
        // Nothing requested: return an empty string
        start = file_size;
    } else {
        // Scan the file backwards, counting the newlines that separate lines
        size_t separators_seen = 0;
        for (long pos = file_size - 1; pos >= 0; pos--) {
            if (fseek(file, pos, SEEK_SET) != 0) {
                syslog(LOG_ERR, "Failed to read file: %s", strerror(errno));
                fclose(file);
                return NULL;
            }
            // fgetc returns an int so that EOF can be told apart from data
            int c = fgetc(file);
            if (c == EOF) {
                syslog(LOG_ERR, "Failed to read file: %s", strerror(errno));
                fclose(file);
                return NULL;
            }
            if (c != '\n') {
                continue;
            }
            // The trailing newline only terminates the last line
            if (pos == file_size - 1) {
                continue;
            }
            separators_seen++;
            if (separators_seen == num_lines) {
                // The tail starts right after the newline we stopped at
                start = pos + 1;
                break;
            }
        }
    }

    // Allocate memory for the buffer to hold the result
    size_t tail_len = (size_t) (file_size - start);
    char *buffer = (char *) malloc(tail_len + 1); // +1 for the null terminator
    if (buffer == NULL) {
        syslog(LOG_ERR, "Failed to allocate memory: %s", strerror(errno));
        fclose(file);
        return NULL;
    }

    // Read from the file from the correct position
    if (fseek(file, start, SEEK_SET) != 0) {
        syslog(LOG_ERR, "Failed to read file: %s", strerror(errno));
        free(buffer);
        fclose(file);
        return NULL;
    }
    size_t bytes_read = fread(buffer, 1, tail_len, file);
    if (bytes_read != tail_len) {
        syslog(LOG_ERR, "Failed to read file: %s", strerror(errno));
        free(buffer);
        fclose(file);
        return NULL;
    }
    buffer[tail_len] = '\0'; // Null-terminate the buffer

    fclose(file);
    return buffer;
}
1 change: 1 addition & 0 deletions pkg/memory-monitor/src/monitor/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,6 @@ int get_eve_release(char *eve_release);
void log_event(const time_t *t, const char *format, ...);
long strtodec(const char *str, bool *error);
unsigned long strtoudec(const char *str, bool *error);
char* get_tail(const char *file_path, size_t lines);

#endif //MM_UTILS_H
Loading