From 41c813cb961fe33652c57f388cd90aab8f1c41e9 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 10:17:22 +0100 Subject: [PATCH 1/7] add BGL dataloader --- loglizer/dataloader.py | 77 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/loglizer/dataloader.py b/loglizer/dataloader.py index ae04778..78a229c 100644 --- a/loglizer/dataloader.py +++ b/loglizer/dataloader.py @@ -7,6 +7,7 @@ """ +import random import pandas as pd import os import numpy as np @@ -162,12 +163,82 @@ def slice_hdfs(x, y, window_size): -def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, stepping_size=60, +def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, train_ratio=0.8): - """ TODO + """ Read a BGL log file to obtain training and test data. + + Args: + -------- + log_file: Input file name with the header + "LineId,Label,Timestamp,,,,,,,,,EventId,,", where commas stand for + unnecessary fields. + windows: Type of windows to use. Can be either 'sliding' or 'fixed'. + time_interval: Time scope of a window in seconds. Used for both fixed and + sliding windows. + stepping_size: Step size of sliding windows in seconds. Used only for + sliding windows. + train_ratio: Fraction of examples to use for training. + + Returns + ------- + (x_train, y_train): The training data. + (x_test, y_test): The testing data. """ + # Load the file and sort lines according to time. + df = pd.read_csv(log_file) + df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d-%H.%M.%S.%f") + df = df.sort_values(by="Time") + df.reset_index(drop=True, inplace=True) + df['LineId'] = range(0, df.shape[0]) + + examples = [] # List of sequences and anomaly labels. + + start_time = df['Timestamp'][0] + end_time = df['Timestamp'].iloc[-1] + + assert window == 'fixed' or window == 'sliding', "Unsupported window." + index = 0 + t0 = start_time + t1 = t0 + time_interval + while t1 < end_time: + sequence = [] + is_anomaly = 0 + # Make a sequence and label it as normal or abnormal. + while df['Timestamp'][index] < t1: + sequence.append(df['EventId'][index]) + if df['Label'][index] != '-': + is_anomaly = 1 + index += 1 + if sequence: + examples.append([sequence, is_anomaly]) + # Translate the window. 
+ if window == "fixed": + t0 = t1 + elif window == "sliding": + t0 += stepping_size + t1 = t0 + time_interval + + random.shuffle(examples) + x = [t[0] for t in examples] + y = [t[1] for t in examples] + + n_train = int(len(x) * train_ratio) + + x_train = np.array(x[:n_train], dtype=list) + y_train = np.array(y[:n_train], dtype=int) + x_test = np.array(x[n_train:], dtype=list) + y_test = np.array(y[n_train:], dtype=int) + + print('Total: {} instances, {} anomaly, {} normal' \ + .format(len(y), sum(y), len(y) - sum(y))) + print('Train: {} instances, {} anomaly, {} normal' \ + .format(len(y_train), sum(y_train), len(y_train) - sum(y_train))) + print('Test: {} instances, {} anomaly, {} normal' \ + .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) + + return (x_train, y_train), (x_test, y_test) def bgl_preprocess_data(para, raw_data, event_mapping_data): """ split logs into sliding windows, built an event count matrix and get the corresponding label @@ -265,4 +336,4 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): assert inst_number == len(labels) print("Among all instances, %d are anomalies"%sum(labels)) assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels + return event_count_matrix, labels \ No newline at end of file From 92df1217aa0233164826150e9e5889dffe77bb90 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 10:51:21 +0100 Subject: [PATCH 2/7] Refactor dataloders into seperate files --- loglizer/{dataloader.py => dataloader/BGL.py} | 188 +++--------------- loglizer/dataloader/HDFS.py | 163 +++++++++++++++ 2 files changed, 193 insertions(+), 158 deletions(-) rename loglizer/{dataloader.py => dataloader/BGL.py} (55%) create mode 100644 loglizer/dataloader/HDFS.py diff --git a/loglizer/dataloader.py b/loglizer/dataloader/BGL.py similarity index 55% rename from loglizer/dataloader.py rename to loglizer/dataloader/BGL.py index 78a229c..c683e13 100644 --- a/loglizer/dataloader.py +++ b/loglizer/dataloader/BGL.py @@ -1,10 +1,9 @@ """ -The interface to load log datasets. The datasets currently supported include -HDFS and BGL. +The interface to load HDFS log datasets. 
Authors: - LogPAI Team - + Vincent-Therrien + Hans Aschenloher """ import random @@ -15,154 +14,6 @@ from sklearn.utils import shuffle from collections import OrderedDict -def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'): - if split_type == 'uniform' and y_data is not None: - pos_idx = y_data > 0 - x_pos = x_data[pos_idx] - y_pos = y_data[pos_idx] - x_neg = x_data[~pos_idx] - y_neg = y_data[~pos_idx] - train_pos = int(train_ratio * x_pos.shape[0]) - train_neg = int(train_ratio * x_neg.shape[0]) - x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]]) - y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]]) - x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]]) - y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]]) - elif split_type == 'sequential': - num_train = int(train_ratio * x_data.shape[0]) - x_train = x_data[0:num_train] - x_test = x_data[num_train:] - if y_data is None: - y_train = None - y_test = None - else: - y_train = y_data[0:num_train] - y_test = y_data[num_train:] - # Random shuffle - indexes = shuffle(np.arange(x_train.shape[0])) - x_train = x_train[indexes] - if y_train is not None: - y_train = y_train[indexes] - return (x_train, y_train), (x_test, y_test) - -def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0): - """ Load HDFS structured log into train and test data - - Arguments - --------- - log_file: str, the file path of structured log. - label_file: str, the file path of anomaly labels, None for unlabeled data - window: str, the window options including `session` (default). - train_ratio: float, the ratio of training data for train/test split. - split_type: `uniform` or `sequential`, which determines how to split dataset. `uniform` means - to split positive samples and negative samples equally when setting label_file. `sequential` - means to split the data sequentially without label_file. That is, the first part is for training, - while the second part is for testing. - - Returns - ------- - (x_train, y_train): the training data - (x_test, y_test): the testing data - """ - - print('====== Input data summary ======') - - if log_file.endswith('.npz'): - # Split training and validation set in a class-uniform way - data = np.load(log_file) - x_data = data['x_data'] - y_data = data['y_data'] - (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type) - - elif log_file.endswith('.csv'): - assert window == 'session', "Only window=session is supported for HDFS dataset." 
- print("Loading", log_file) - struct_log = pd.read_csv(log_file, engine='c', - na_filter=False, memory_map=True) - data_dict = OrderedDict() - for idx, row in struct_log.iterrows(): - blkId_list = re.findall(r'(blk_-?\d+)', row['Content']) - blkId_set = set(blkId_list) - for blk_Id in blkId_set: - if not blk_Id in data_dict: - data_dict[blk_Id] = [] - data_dict[blk_Id].append(row['EventId']) - data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence']) - - if label_file: - # Split training and validation set in a class-uniform way - label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True) - label_data = label_data.set_index('BlockId') - label_dict = label_data['Label'].to_dict() - data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0) - - # Split train and test data - (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, - data_df['Label'].values, train_ratio, split_type) - - print(y_train.sum(), y_test.sum()) - - if save_csv: - data_df.to_csv('data_instances.csv', index=False) - - if window_size > 0: - x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size) - x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size) - log = "{} {} windows ({}/{} anomaly), {}/{} normal" - print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1-y_train).sum(), y_train.shape[0])) - print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1-y_test).sum(), y_test.shape[0])) - return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) - - if label_file is None: - if split_type == 'uniform': - split_type = 'sequential' - print('Warning: Only split_type=sequential is supported \ - if label_file=None.'.format(split_type)) - # Split training and validation set sequentially - x_data = data_df['EventSequence'].values - (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type) - print('Total: {} instances, train: {} instances, test: {} instances'.format( - x_data.shape[0], x_train.shape[0], x_test.shape[0])) - return (x_train, None), (x_test, None), data_df - else: - raise NotImplementedError('load_HDFS() only support csv and npz files!') - - num_train = x_train.shape[0] - num_test = x_test.shape[0] - num_total = num_train + num_test - num_train_pos = sum(y_train) - num_test_pos = sum(y_test) - num_pos = num_train_pos + num_test_pos - - print('Total: {} instances, {} anomaly, {} normal' \ - .format(num_total, num_pos, num_total - num_pos)) - print('Train: {} instances, {} anomaly, {} normal' \ - .format(num_train, num_train_pos, num_train - num_train_pos)) - print('Test: {} instances, {} anomaly, {} normal\n' \ - .format(num_test, num_test_pos, num_test - num_test_pos)) - - return (x_train, y_train), (x_test, y_test) - -def slice_hdfs(x, y, window_size): - results_data = [] - print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) - for idx, sequence in enumerate(x): - seqlen = len(sequence) - i = 0 - while (i + window_size) < seqlen: - slice = sequence[i: i + window_size] - results_data.append([idx, slice, sequence[i + window_size], y[idx]]) - i += 1 - else: - slice = sequence[i: i + window_size] - slice += ["#Pad"] * (window_size - len(slice)) - results_data.append([idx, slice, "#Pad", y[idx]]) - results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) - print("Slicing done, {} windows 
generated".format(results_df.shape[0])) - return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] - - - def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, train_ratio=0.8): """ Read a BGL log file to obtain training and test data. @@ -191,10 +42,10 @@ def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d-%H.%M.%S.%f") df = df.sort_values(by="Time") df.reset_index(drop=True, inplace=True) - df['LineId'] = range(0, df.shape[0]) + df['LineId'] = range(0, df.shape[0]) examples = [] # List of sequences and anomaly labels. - + start_time = df['Timestamp'][0] end_time = df['Timestamp'].iloc[-1] @@ -219,11 +70,11 @@ def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, elif window == "sliding": t0 += stepping_size t1 = t0 + time_interval - + random.shuffle(examples) x = [t[0] for t in examples] y = [t[1] for t in examples] - + n_train = int(len(x) * train_ratio) x_train = np.array(x[:n_train], dtype=list) @@ -237,9 +88,30 @@ def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, .format(len(y_train), sum(y_train), len(y_train) - sum(y_train))) print('Test: {} instances, {} anomaly, {} normal' \ .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) - + return (x_train, y_train), (x_test, y_test) + +def slice_hdfs(x, y, window_size): + results_data = [] + print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) + for idx, sequence in enumerate(x): + seqlen = len(sequence) + i = 0 + while (i + window_size) < seqlen: + slice = sequence[i: i + window_size] + results_data.append([idx, slice, sequence[i + window_size], y[idx]]) + i += 1 + else: + slice = sequence[i: i + window_size] + slice += ["#Pad"] * (window_size - len(slice)) + results_data.append([idx, slice, "#Pad", y[idx]]) + results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) + print("Slicing done, {} windows generated".format(results_df.shape[0])) + return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] + + + def bgl_preprocess_data(para, raw_data, event_mapping_data): """ split logs into sliding windows, built an event count matrix and get the corresponding label @@ -336,4 +208,4 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): assert inst_number == len(labels) print("Among all instances, %d are anomalies"%sum(labels)) assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels \ No newline at end of file + return event_count_matrix, labels diff --git a/loglizer/dataloader/HDFS.py b/loglizer/dataloader/HDFS.py new file mode 100644 index 0000000..157d9f7 --- /dev/null +++ b/loglizer/dataloader/HDFS.py @@ -0,0 +1,163 @@ +""" +The interface to load HDFS log datasets. + +Authors: + LogPAI Team + Hans Aschenloher +""" + +import random +import pandas as pd +import os +import numpy as np +import re +from sklearn.utils import shuffle +from collections import OrderedDict + +def loadDataset(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0): + """ Load HDFS structured log into train and test data + + Arguments + --------- + log_file: str, the file path of structured log. + label_file: str, the file path of anomaly labels, None for unlabeled data + window: str, the window options including `session` (default). 
+ train_ratio: float, the ratio of training data for train/test split. + split_type: `uniform` or `sequential`, which determines how to split dataset. `uniform` means + to split positive samples and negative samples equally when setting label_file. `sequential` + means to split the data sequentially without label_file. That is, the first part is for training, + while the second part is for testing. + + Returns + ------- + (x_train, y_train): the training data + (x_test, y_test): the testing data + """ + + print('====== Input data summary ======') + + if log_file.endswith('.npz'): + # Split training and validation set in a class-uniform way + data = np.load(log_file) + x_data = data['x_data'] + y_data = data['y_data'] + (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type) + + elif log_file.endswith('.csv'): + assert window == 'session', "Only window=session is supported for HDFS dataset." + print("Loading", log_file) + struct_log = pd.read_csv(log_file, engine='c', + na_filter=False, memory_map=True) + data_dict = OrderedDict() + for idx, row in struct_log.iterrows(): + blkId_list = re.findall(r'(blk_-?\d+)', row['Content']) + blkId_set = set(blkId_list) + for blk_Id in blkId_set: + if not blk_Id in data_dict: + data_dict[blk_Id] = [] + data_dict[blk_Id].append(row['EventId']) + data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence']) + + if label_file: + # Split training and validation set in a class-uniform way + label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True) + label_data = label_data.set_index('BlockId') + label_dict = label_data['Label'].to_dict() + data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0) + + # Split train and test data + (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, + data_df['Label'].values, train_ratio, split_type) + + print(y_train.sum(), y_test.sum()) + + if save_csv: + data_df.to_csv('data_instances.csv', index=False) + + if window_size > 0: + x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size) + x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size) + log = "{} {} windows ({}/{} anomaly), {}/{} normal" + print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1-y_train).sum(), y_train.shape[0])) + print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1-y_test).sum(), y_test.shape[0])) + return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) + + if label_file is None: + if split_type == 'uniform': + split_type = 'sequential' + print('Warning: Only split_type=sequential is supported \ + if label_file=None.'.format(split_type)) + # Split training and validation set sequentially + x_data = data_df['EventSequence'].values + (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type) + print('Total: {} instances, train: {} instances, test: {} instances'.format( + x_data.shape[0], x_train.shape[0], x_test.shape[0])) + return (x_train, None), (x_test, None), data_df + else: + raise NotImplementedError('load_HDFS() only support csv and npz files!') + + num_train = x_train.shape[0] + num_test = x_test.shape[0] + num_total = num_train + num_test + num_train_pos = sum(y_train) + num_test_pos = sum(y_test) + num_pos = num_train_pos + num_test_pos + + print('Total: {} instances, {} anomaly, {} normal' \ + .format(num_total, num_pos, num_total - num_pos)) + 
print('Train: {} instances, {} anomaly, {} normal' \ + .format(num_train, num_train_pos, num_train - num_train_pos)) + print('Test: {} instances, {} anomaly, {} normal\n' \ + .format(num_test, num_test_pos, num_test - num_test_pos)) + + return (x_train, y_train), (x_test, y_test) + +def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'): + if split_type == 'uniform' and y_data is not None: + pos_idx = y_data > 0 + x_pos = x_data[pos_idx] + y_pos = y_data[pos_idx] + x_neg = x_data[~pos_idx] + y_neg = y_data[~pos_idx] + train_pos = int(train_ratio * x_pos.shape[0]) + train_neg = int(train_ratio * x_neg.shape[0]) + x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]]) + y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]]) + x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]]) + y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]]) + elif split_type == 'sequential': + num_train = int(train_ratio * x_data.shape[0]) + x_train = x_data[0:num_train] + x_test = x_data[num_train:] + if y_data is None: + y_train = None + y_test = None + else: + y_train = y_data[0:num_train] + y_test = y_data[num_train:] + # Random shuffle + indexes = shuffle(np.arange(x_train.shape[0])) + x_train = x_train[indexes] + if y_train is not None: + y_train = y_train[indexes] + return (x_train, y_train), (x_test, y_test) + + +def slice_hdfs(x, y, window_size): + results_data = [] + print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) + for idx, sequence in enumerate(x): + seqlen = len(sequence) + i = 0 + while (i + window_size) < seqlen: + slice = sequence[i: i + window_size] + results_data.append([idx, slice, sequence[i + window_size], y[idx]]) + i += 1 + else: + slice = sequence[i: i + window_size] + slice += ["#Pad"] * (window_size - len(slice)) + results_data.append([idx, slice, "#Pad", y[idx]]) + results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) + print("Slicing done, {} windows generated".format(results_df.shape[0])) + return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] + From f8548709d2ecbaf5abdf0e79ba9439b520d1593c Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 11:01:30 +0100 Subject: [PATCH 3/7] Update the demos in respect to the dataloder changes --- demo/DecisionTree_demo.py | 8 +++++--- demo/DeepLog_demo.py | 9 +++++---- demo/InvariantsMiner_demo.py | 10 ++++++---- demo/InvariantsMiner_demo_without_labels.py | 16 ++++++++------- demo/IsolationForest_demo.py | 9 +++++---- demo/LR_demo.py | 8 +++++--- demo/LogClustering_demo.py | 10 ++++++---- demo/PCA_demo.py | 12 ++++++----- demo/PCA_demo_without_labels.py | 22 +++++++++++---------- demo/SVM_demo.py | 8 +++++--- 10 files changed, 65 insertions(+), 47 deletions(-) diff --git a/demo/DecisionTree_demo.py b/demo/DecisionTree_demo.py index 387b186..0c9d46e 100644 --- a/demo/DecisionTree_demo.py +++ b/demo/DecisionTree_demo.py @@ -4,15 +4,17 @@ import sys sys.path.append('../') from loglizer.models import DecisionTree -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = 
HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') diff --git a/demo/DeepLog_demo.py b/demo/DeepLog_demo.py index b450ec9..28dd28c 100644 --- a/demo/DeepLog_demo.py +++ b/demo/DeepLog_demo.py @@ -6,7 +6,8 @@ from loglizer import dataloader from loglizer.models import DeepLog from loglizer.preprocessing import Vectorizer, Iterator - +from loglizer import preprocessing +from loglizer.dataloader import HDFS batch_size = 32 hidden_size = 32 @@ -16,14 +17,14 @@ window_size = 10 epoches = 2 num_workers = 2 -device = 0 +device = 0 struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = dataloader.load_HDFS(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform') - + (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform') + feature_extractor = Vectorizer() train_dataset = feature_extractor.fit_transform(x_train, window_y_train, y_train) test_dataset = feature_extractor.transform(x_test, window_y_test, y_test) diff --git a/demo/InvariantsMiner_demo.py b/demo/InvariantsMiner_demo.py index 05a7c61..bde0c68 100644 --- a/demo/InvariantsMiner_demo.py +++ b/demo/InvariantsMiner_demo.py @@ -4,16 +4,18 @@ import sys sys.path.append('../') from loglizer.models import InvariantsMiner -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file epsilon = 0.5 # threshold for estimating invariant space if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='sequential') feature_extractor = preprocessing.FeatureExtractor() @@ -25,7 +27,7 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/InvariantsMiner_demo_without_labels.py b/demo/InvariantsMiner_demo_without_labels.py index 47a9e6c..876757e 100644 --- a/demo/InvariantsMiner_demo_without_labels.py +++ b/demo/InvariantsMiner_demo_without_labels.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- ''' This is a demo file for the Invariants Mining model. 
API usage: - dataloader.load_HDFS(): load HDFS dataset + HDFS.loadDataset(): load HDFS dataset feature_extractor.fit_transform(): fit and transform features feature_extractor.transform(): feature transform after fitting model.fit(): fit the model @@ -13,7 +13,9 @@ import sys sys.path.append('../') from loglizer.models import InvariantsMiner -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file @@ -21,8 +23,8 @@ if __name__ == '__main__': # Load structured log without label info - (x_train, _), (x_test, _) = dataloader.load_HDFS(struct_log, - window='session', + (x_train, _), (x_test, _) = HDFS.loadDataset(struct_log, + window='session', train_ratio=0.5, split_type='sequential') # Feature extraction @@ -43,11 +45,11 @@ # If you have labeled data, you can evaluate the accuracy of the model as well. # Load structured log with label info - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, - split_type='sequential') + split_type='sequential') x_test = feature_extractor.transform(x_test) precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/IsolationForest_demo.py b/demo/IsolationForest_demo.py index e2cb63d..274eea2 100644 --- a/demo/IsolationForest_demo.py +++ b/demo/IsolationForest_demo.py @@ -4,16 +4,17 @@ import sys sys.path.append('../') from loglizer.models import IsolationForest -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file anomaly_ratio = 0.03 # Estimate the ratio of anomaly samples in the data if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') feature_extractor = preprocessing.FeatureExtractor() @@ -25,7 +26,7 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/LR_demo.py b/demo/LR_demo.py index 56ee5ce..0da1a52 100644 --- a/demo/LR_demo.py +++ b/demo/LR_demo.py @@ -4,15 +4,17 @@ import sys sys.path.append('../') from loglizer.models import LR -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') diff --git a/demo/LogClustering_demo.py b/demo/LogClustering_demo.py index 0c82a09..d3dc852 100644 --- a/demo/LogClustering_demo.py +++ 
b/demo/LogClustering_demo.py @@ -4,7 +4,9 @@ import sys sys.path.append('../') from loglizer.models import LogClustering -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file @@ -12,9 +14,9 @@ anomaly_threshold = 0.3 # the threshold for anomaly detection if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') feature_extractor = preprocessing.FeatureExtractor() @@ -26,6 +28,6 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/PCA_demo.py b/demo/PCA_demo.py index 74a1892..7d47a59 100644 --- a/demo/PCA_demo.py +++ b/demo/PCA_demo.py @@ -4,19 +4,21 @@ import sys sys.path.append('../') from loglizer.models import PCA -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') feature_extractor = preprocessing.FeatureExtractor() - x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', + x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean') x_test = feature_extractor.transform(x_test) @@ -25,6 +27,6 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/PCA_demo_without_labels.py b/demo/PCA_demo_without_labels.py index d54a1c0..6ada2f3 100644 --- a/demo/PCA_demo_without_labels.py +++ b/demo/PCA_demo_without_labels.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- ''' This is a demo file for the PCA model. API usage: - dataloader.load_HDFS(): load HDFS dataset + HDFS.loadDataset(): load HDFS dataset feature_extractor.fit_transform(): fit and transform features feature_extractor.transform(): feature transform after fitting model.fit(): fit the model @@ -13,36 +13,38 @@ import sys sys.path.append('../') from loglizer.models import PCA -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file if __name__ == '__main__': ## 1. 
Load strutured log file and extract feature vectors # Save the raw event sequence file by setting save_csv=True - (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', + (x_train, _), (_, _) = HDFS.loadDataset(struct_log, window='session', split_type='sequential', save_csv=True) feature_extractor = preprocessing.FeatureExtractor() - x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', + x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean') - + ## 2. Train an unsupervised model print('Train phase:') # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner - model = PCA() + model = PCA() # Model hyper-parameters may be sensitive to log data, here we use the default for demo model.fit(x_train) # Make predictions and manually check for correctness. Details may need to go into the raw logs - y_train = model.predict(x_train) + y_train = model.predict(x_train) ## 3. Use the trained model for online anomaly detection print('Test phase:') # Load another new log file. Here we use struct_log for demo only - (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential') + (x_test, _), (_, _) = HDFS.loadDataset(struct_log, window='session', split_type='sequential') # Go through the same feature extraction process with training, using transform() instead - x_test = feature_extractor.transform(x_test) + x_test = feature_extractor.transform(x_test) # Finally make predictions and alter on anomaly cases y_test = model.predict(x_test) - + diff --git a/demo/SVM_demo.py b/demo/SVM_demo.py index 088ffe7..e6cb76f 100644 --- a/demo/SVM_demo.py +++ b/demo/SVM_demo.py @@ -4,15 +4,17 @@ import sys sys.path.append('../') from loglizer.models import SVM -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') From 66c8cd8820460282cf02c9a4c8560493195e81f2 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 15:30:46 +0100 Subject: [PATCH 4/7] Refactor load_BGL mehtod name --- loglizer/dataloader/BGL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loglizer/dataloader/BGL.py b/loglizer/dataloader/BGL.py index c683e13..9f7a738 100644 --- a/loglizer/dataloader/BGL.py +++ b/loglizer/dataloader/BGL.py @@ -14,7 +14,7 @@ from sklearn.utils import shuffle from collections import OrderedDict -def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, +def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, train_ratio=0.8): """ Read a BGL log file to obtain training and test data. 
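For reference, after patches 2 and 4 the BGL loader is reachable as loglizer.dataloader.BGL.loadDataset(). The sketch below is a hypothetical demo script (not part of this patch series) showing how that call might be wired up; the input path is an assumption, and the feature-extraction and model steps simply mirror the updated HDFS demos. The structured CSV must contain the columns the loader reads above: Label, Time (formatted "%Y-%m-%d-%H.%M.%S.%f"), a numeric Timestamp, and EventId.

# Hypothetical usage sketch, e.g. demo/BGL_demo.py; path and parameters are
# illustrative only.
import sys
sys.path.append('../')
from loglizer import preprocessing
from loglizer.dataloader import BGL
from loglizer.models import DecisionTree

struct_log = '../data/BGL/BGL_100k.log_structured.csv'  # hypothetical structured BGL log

if __name__ == '__main__':
    # One-hour fixed windows (time_interval is in seconds), 80/20 train/test split.
    (x_train, y_train), (x_test, y_test) = BGL.loadDataset(
        struct_log, window='fixed', time_interval=3600, train_ratio=0.8)

    # Turn each window's event sequence into a weighted event-count vector,
    # exactly as the HDFS demos do.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf')
    x_test = feature_extractor.transform(x_test)

    # Any supervised model from loglizer.models could be used here; DecisionTree
    # is chosen only to match the existing demos.
    model = DecisionTree()
    model.fit(x_train, y_train)

    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

The same pattern should apply to the Thunderbird loader added in patch 5, since its loadDataset() signature and return values are identical to the BGL version.
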
From a0cce9e32b4bb83fda7ee4e8bee3d8ed1ab5e3f9 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 8 Feb 2022 23:09:58 +0100 Subject: [PATCH 5/7] Add a thunderbird dataloader --- loglizer/dataloader/Thunderbird.py | 189 +++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 loglizer/dataloader/Thunderbird.py diff --git a/loglizer/dataloader/Thunderbird.py b/loglizer/dataloader/Thunderbird.py new file mode 100644 index 0000000..6497ad0 --- /dev/null +++ b/loglizer/dataloader/Thunderbird.py @@ -0,0 +1,189 @@ +""" +The interface to load HDFS log datasets. + +Authors: + Hans Aschenloher +""" + +import random +import pandas as pd +import os +import numpy as np +import re +from sklearn.utils import shuffle +from collections import OrderedDict + +def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, + train_ratio=0.8): + """ Read a Thunderbird log file to obtain training and test data. + + Args: + -------- + log_file: Input file name with the header + "LineId,Label,Timestamp,,,,,,,,,EventId,,", where commas stand for + unnecessary fields. + windows: Type of windows to use. Can be either 'sliding' or 'fixed'. + time_interval: Time scope of a window in seconds. Used for both fixed and + sliding windows. + stepping_size: Step size of sliding windows in seconds. Used only for + sliding windows. + train_ratio: Fraction of examples to use for training. + + Returns + ------- + (x_train, y_train): The training data. + (x_test, y_test): The testing data. + + """ + + # Load the file and sort lines according to time. + df = pd.read_csv(log_file) + #df['Time'] = pd.to_datetime( str(df['Month'])+" " + str(df['Day']) + " " + str(df['Time']), format="%b %d %H:%M:%S") + df = df.sort_values(by="Timestamp") + df.reset_index(drop=True, inplace=True) + df['LineId'] = range(0, df.shape[0]) + + examples = [] # List of sequences and anomaly labels. + + start_time = df['Timestamp'][0] + end_time = df['Timestamp'].iloc[-1] + + assert window == 'fixed' or window == 'sliding', "Unsupported window." + index = 0 + t0 = start_time + t1 = t0 + time_interval + while t1 < end_time: + sequence = [] + is_anomaly = 0 + # Make a sequence and label it as normal or abnormal. + while df['Timestamp'][index] < t1: + sequence.append(df['EventId'][index]) + if df['Label'][index] != '-': + is_anomaly = 1 + index += 1 + if sequence: + examples.append([sequence, is_anomaly]) + # Translate the window. 
+ if window == "fixed": + t0 = t1 + elif window == "sliding": + t0 += stepping_size + t1 = t0 + time_interval + + random.shuffle(examples) + x = [t[0] for t in examples] + y = [t[1] for t in examples] + + n_train = int(len(x) * train_ratio) + + x_train = np.array(x[:n_train], dtype=list) + y_train = np.array(y[:n_train], dtype=int) + x_test = np.array(x[n_train:], dtype=list) + y_test = np.array(y[n_train:], dtype=int) + + print('Total: {} instances, {} anomaly, {} normal' \ + .format(len(y), sum(y), len(y) - sum(y))) + print('Train: {} instances, {} anomaly, {} normal' \ + .format(len(y_train), sum(y_train), len(y_train) - sum(y_train))) + print('Test: {} instances, {} anomaly, {} normal' \ + .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) + + return (x_train, y_train), (x_test, y_test) + +def thunderbird_preprocess_data(para, raw_data, event_mapping_data): + """ split logs into sliding windows, built an event count matrix and get the corresponding label + + Args: + -------- + para: the parameters dictionary + raw_data: list of (label, time) + event_mapping_data: a list of event index, where each row index indicates a corresponding log + + Returns: + -------- + event_count_matrix: event count matrix, where each row is an instance (log sequence vector) + labels: a list of labels, 1 represents anomaly + """ + + # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running + if not os.path.exists(para['save_path']): + os.mkdir(para['save_path']) + log_size = raw_data.shape[0] + sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' + + #=============divide into sliding windows=========# + start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window + label_data, time_data = raw_data[:,0], raw_data[:, 1] + if not os.path.exists(sliding_file_path): + # split into sliding window + start_time = time_data[0] + start_index = 0 + end_index = 0 + + # get the first start, end index, end time + for cur_time in time_data: + if cur_time < start_time + para['window_size']*3600: + end_index += 1 + end_time = cur_time + else: + start_end_pair=tuple((start_index,end_index)) + start_end_index_list.append(start_end_pair) + break + # move the start and end index until next sliding window + while end_index < log_size: + start_time = start_time + para['step_size']*3600 + end_time = end_time + para['step_size']*3600 + for i in range(start_index,end_index): + if time_data[i] < start_time: + i+=1 + else: + break + for j in range(end_index, log_size): + if time_data[j] < end_time: + j+=1 + else: + break + start_index = i + end_index = j + start_end_pair = tuple((start_index, end_index)) + start_end_index_list.append(start_end_pair) + inst_number = len(start_end_index_list) + print('there are %d instances (sliding windows) in this dataset\n'%inst_number) + np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d') + else: + print('Loading start_end_index_list from file') + start_end_index_list = pd.read_csv(sliding_file_path, header=None).values + inst_number = len(start_end_index_list) + print('there are %d instances (sliding windows) in this dataset' % inst_number) + + # get all the log indexes in each time window by ranging from start_index to end_index + expanded_indexes_list=[] + for t in range(inst_number): + index_list = [] + expanded_indexes_list.append(index_list) + for i in range(inst_number): + 
start_index = start_end_index_list[i][0] + end_index = start_end_index_list[i][1] + for l in range(start_index, end_index): + expanded_indexes_list[i].append(l) + + event_mapping_data = [row[0] for row in event_mapping_data] + event_num = len(list(set(event_mapping_data))) + print('There are %d log events'%event_num) + + #=============get labels and event count of each sliding window =========# + labels = [] + event_count_matrix = np.zeros((inst_number,event_num)) + for j in range(inst_number): + label = 0 #0 represent success, 1 represent failure + for k in expanded_indexes_list[j]: + event_index = event_mapping_data[k] + event_count_matrix[j, event_index] += 1 + if label_data[k]: + label = 1 + continue + labels.append(label) + assert inst_number == len(labels) + print("Among all instances, %d are anomalies"%sum(labels)) + assert event_count_matrix.shape[0] == len(labels) + return event_count_matrix, labels From 027e11539c1154590c81adf461f4f4a27d29d3a7 Mon Sep 17 00:00:00 2001 From: Johann Aschenloher Date: Fri, 11 Feb 2022 08:16:58 +0100 Subject: [PATCH 6/7] Remove unused code --- loglizer/dataloader/BGL.py | 121 +---------------------------- loglizer/dataloader/Thunderbird.py | 102 +----------------------- 2 files changed, 3 insertions(+), 220 deletions(-) diff --git a/loglizer/dataloader/BGL.py b/loglizer/dataloader/BGL.py index 9f7a738..d86581a 100644 --- a/loglizer/dataloader/BGL.py +++ b/loglizer/dataloader/BGL.py @@ -1,5 +1,5 @@ """ -The interface to load HDFS log datasets. +The interface to load BGL log datasets. Authors: Vincent-Therrien @@ -90,122 +90,3 @@ def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) return (x_train, y_train), (x_test, y_test) - - -def slice_hdfs(x, y, window_size): - results_data = [] - print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) - for idx, sequence in enumerate(x): - seqlen = len(sequence) - i = 0 - while (i + window_size) < seqlen: - slice = sequence[i: i + window_size] - results_data.append([idx, slice, sequence[i + window_size], y[idx]]) - i += 1 - else: - slice = sequence[i: i + window_size] - slice += ["#Pad"] * (window_size - len(slice)) - results_data.append([idx, slice, "#Pad", y[idx]]) - results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) - print("Slicing done, {} windows generated".format(results_df.shape[0])) - return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] - - - -def bgl_preprocess_data(para, raw_data, event_mapping_data): - """ split logs into sliding windows, built an event count matrix and get the corresponding label - - Args: - -------- - para: the parameters dictionary - raw_data: list of (label, time) - event_mapping_data: a list of event index, where each row index indicates a corresponding log - - Returns: - -------- - event_count_matrix: event count matrix, where each row is an instance (log sequence vector) - labels: a list of labels, 1 represents anomaly - """ - - # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running - if not os.path.exists(para['save_path']): - os.mkdir(para['save_path']) - log_size = raw_data.shape[0] - sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' - - #=============divide into sliding windows=========# - start_end_index_list = [] # list of tuples, tuple 
contains two number, which represent the start and end of sliding time window - label_data, time_data = raw_data[:,0], raw_data[:, 1] - if not os.path.exists(sliding_file_path): - # split into sliding window - start_time = time_data[0] - start_index = 0 - end_index = 0 - - # get the first start, end index, end time - for cur_time in time_data: - if cur_time < start_time + para['window_size']*3600: - end_index += 1 - end_time = cur_time - else: - start_end_pair=tuple((start_index,end_index)) - start_end_index_list.append(start_end_pair) - break - # move the start and end index until next sliding window - while end_index < log_size: - start_time = start_time + para['step_size']*3600 - end_time = end_time + para['step_size']*3600 - for i in range(start_index,end_index): - if time_data[i] < start_time: - i+=1 - else: - break - for j in range(end_index, log_size): - if time_data[j] < end_time: - j+=1 - else: - break - start_index = i - end_index = j - start_end_pair = tuple((start_index, end_index)) - start_end_index_list.append(start_end_pair) - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset\n'%inst_number) - np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d') - else: - print('Loading start_end_index_list from file') - start_end_index_list = pd.read_csv(sliding_file_path, header=None).values - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset' % inst_number) - - # get all the log indexes in each time window by ranging from start_index to end_index - expanded_indexes_list=[] - for t in range(inst_number): - index_list = [] - expanded_indexes_list.append(index_list) - for i in range(inst_number): - start_index = start_end_index_list[i][0] - end_index = start_end_index_list[i][1] - for l in range(start_index, end_index): - expanded_indexes_list[i].append(l) - - event_mapping_data = [row[0] for row in event_mapping_data] - event_num = len(list(set(event_mapping_data))) - print('There are %d log events'%event_num) - - #=============get labels and event count of each sliding window =========# - labels = [] - event_count_matrix = np.zeros((inst_number,event_num)) - for j in range(inst_number): - label = 0 #0 represent success, 1 represent failure - for k in expanded_indexes_list[j]: - event_index = event_mapping_data[k] - event_count_matrix[j, event_index] += 1 - if label_data[k]: - label = 1 - continue - labels.append(label) - assert inst_number == len(labels) - print("Among all instances, %d are anomalies"%sum(labels)) - assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels diff --git a/loglizer/dataloader/Thunderbird.py b/loglizer/dataloader/Thunderbird.py index 6497ad0..2b11e55 100644 --- a/loglizer/dataloader/Thunderbird.py +++ b/loglizer/dataloader/Thunderbird.py @@ -1,5 +1,5 @@ """ -The interface to load HDFS log datasets. +The interface to load Thunderbird log datasets. Authors: Hans Aschenloher @@ -38,7 +38,7 @@ def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, # Load the file and sort lines according to time. 
df = pd.read_csv(log_file) - #df['Time'] = pd.to_datetime( str(df['Month'])+" " + str(df['Day']) + " " + str(df['Time']), format="%b %d %H:%M:%S") + df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d-%H.%M.%S.%f") df = df.sort_values(by="Timestamp") df.reset_index(drop=True, inplace=True) df['LineId'] = range(0, df.shape[0]) @@ -89,101 +89,3 @@ def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) return (x_train, y_train), (x_test, y_test) - -def thunderbird_preprocess_data(para, raw_data, event_mapping_data): - """ split logs into sliding windows, built an event count matrix and get the corresponding label - - Args: - -------- - para: the parameters dictionary - raw_data: list of (label, time) - event_mapping_data: a list of event index, where each row index indicates a corresponding log - - Returns: - -------- - event_count_matrix: event count matrix, where each row is an instance (log sequence vector) - labels: a list of labels, 1 represents anomaly - """ - - # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running - if not os.path.exists(para['save_path']): - os.mkdir(para['save_path']) - log_size = raw_data.shape[0] - sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' - - #=============divide into sliding windows=========# - start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window - label_data, time_data = raw_data[:,0], raw_data[:, 1] - if not os.path.exists(sliding_file_path): - # split into sliding window - start_time = time_data[0] - start_index = 0 - end_index = 0 - - # get the first start, end index, end time - for cur_time in time_data: - if cur_time < start_time + para['window_size']*3600: - end_index += 1 - end_time = cur_time - else: - start_end_pair=tuple((start_index,end_index)) - start_end_index_list.append(start_end_pair) - break - # move the start and end index until next sliding window - while end_index < log_size: - start_time = start_time + para['step_size']*3600 - end_time = end_time + para['step_size']*3600 - for i in range(start_index,end_index): - if time_data[i] < start_time: - i+=1 - else: - break - for j in range(end_index, log_size): - if time_data[j] < end_time: - j+=1 - else: - break - start_index = i - end_index = j - start_end_pair = tuple((start_index, end_index)) - start_end_index_list.append(start_end_pair) - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset\n'%inst_number) - np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d') - else: - print('Loading start_end_index_list from file') - start_end_index_list = pd.read_csv(sliding_file_path, header=None).values - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset' % inst_number) - - # get all the log indexes in each time window by ranging from start_index to end_index - expanded_indexes_list=[] - for t in range(inst_number): - index_list = [] - expanded_indexes_list.append(index_list) - for i in range(inst_number): - start_index = start_end_index_list[i][0] - end_index = start_end_index_list[i][1] - for l in range(start_index, end_index): - expanded_indexes_list[i].append(l) - - event_mapping_data = [row[0] for row in event_mapping_data] - event_num = len(list(set(event_mapping_data))) 
- print('There are %d log events'%event_num) - - #=============get labels and event count of each sliding window =========# - labels = [] - event_count_matrix = np.zeros((inst_number,event_num)) - for j in range(inst_number): - label = 0 #0 represent success, 1 represent failure - for k in expanded_indexes_list[j]: - event_index = event_mapping_data[k] - event_count_matrix[j, event_index] += 1 - if label_data[k]: - label = 1 - continue - labels.append(label) - assert inst_number == len(labels) - print("Among all instances, %d are anomalies"%sum(labels)) - assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels From a7fe3de7d3baf1486e74835acd172342da0572fc Mon Sep 17 00:00:00 2001 From: Johann Aschenloher Date: Fri, 11 Feb 2022 08:26:23 +0100 Subject: [PATCH 7/7] Update benchmark in respect to the dataloder changes --- benchmarks/HDFS_bechmark.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/benchmarks/HDFS_bechmark.py b/benchmarks/HDFS_bechmark.py index 788a8b4..c36dc35 100644 --- a/benchmarks/HDFS_bechmark.py +++ b/benchmarks/HDFS_bechmark.py @@ -5,27 +5,28 @@ sys.path.append('../') import pandas as pd from loglizer.models import * -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS -run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR', +run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR', 'SVM', 'DecisionTree'] struct_log = '../data/HDFS/HDFS.npz' # The benchmark dataset if __name__ == '__main__': - (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log, - window='session', - train_ratio=0.5, - split_type='uniform') + (x_tr, y_train), (x_te, y_test) = HDFS.loadDataset(struct_log, + window='session', + train_ratio=0.5, + split_type='uniform') benchmark_results = [] for _model in run_models: print('Evaluating {} on HDFS:'.format(_model)) if _model == 'PCA': feature_extractor = preprocessing.FeatureExtractor() - x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf', + x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf', normalization='zero-mean') model = PCA() model.fit(x_train) - + elif _model == 'InvariantsMiner': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_tr) @@ -41,7 +42,7 @@ elif _model == 'IsolationForest': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_tr) - model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03, + model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03, n_jobs=4) model.fit(x_train) @@ -62,7 +63,7 @@ x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf') model = DecisionTree() model.fit(x_train, y_train) - + x_test = feature_extractor.transform(x_te) print('Train accuracy:') precision, recall, f1 = model.evaluate(x_train, y_train)