From 41c813cb961fe33652c57f388cd90aab8f1c41e9 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 10:17:22 +0100 Subject: [PATCH 1/7] add BGL dataloader --- loglizer/dataloader.py | 77 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/loglizer/dataloader.py b/loglizer/dataloader.py index ae04778..78a229c 100644 --- a/loglizer/dataloader.py +++ b/loglizer/dataloader.py @@ -7,6 +7,7 @@ """ +import random import pandas as pd import os import numpy as np @@ -162,12 +163,82 @@ def slice_hdfs(x, y, window_size): -def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, stepping_size=60, +def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, train_ratio=0.8): - """ TODO + """ Read a BGL log file to obtain training and test data. + + Args: + -------- + log_file: Input file name with the header + "LineId,Label,Timestamp,,,,,,,,,EventId,,", where commas stand for + unnecessary fields. + windows: Type of windows to use. Can be either 'sliding' or 'fixed'. + time_interval: Time scope of a window in seconds. Used for both fixed and + sliding windows. + stepping_size: Step size of sliding windows in seconds. Used only for + sliding windows. + train_ratio: Fraction of examples to use for training. + + Returns + ------- + (x_train, y_train): The training data. + (x_test, y_test): The testing data. """ + # Load the file and sort lines according to time. + df = pd.read_csv(log_file) + df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d-%H.%M.%S.%f") + df = df.sort_values(by="Time") + df.reset_index(drop=True, inplace=True) + df['LineId'] = range(0, df.shape[0]) + + examples = [] # List of sequences and anomaly labels. + + start_time = df['Timestamp'][0] + end_time = df['Timestamp'].iloc[-1] + + assert window == 'fixed' or window == 'sliding', "Unsupported window." + index = 0 + t0 = start_time + t1 = t0 + time_interval + while t1 < end_time: + sequence = [] + is_anomaly = 0 + # Make a sequence and label it as normal or abnormal. + while df['Timestamp'][index] < t1: + sequence.append(df['EventId'][index]) + if df['Label'][index] != '-': + is_anomaly = 1 + index += 1 + if sequence: + examples.append([sequence, is_anomaly]) + # Translate the window. 
+ if window == "fixed": + t0 = t1 + elif window == "sliding": + t0 += stepping_size + t1 = t0 + time_interval + + random.shuffle(examples) + x = [t[0] for t in examples] + y = [t[1] for t in examples] + + n_train = int(len(x) * train_ratio) + + x_train = np.array(x[:n_train], dtype=list) + y_train = np.array(y[:n_train], dtype=int) + x_test = np.array(x[n_train:], dtype=list) + y_test = np.array(y[n_train:], dtype=int) + + print('Total: {} instances, {} anomaly, {} normal' \ + .format(len(y), sum(y), len(y) - sum(y))) + print('Train: {} instances, {} anomaly, {} normal' \ + .format(len(y_train), sum(y_train), len(y_train) - sum(y_train))) + print('Test: {} instances, {} anomaly, {} normal' \ + .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) + + return (x_train, y_train), (x_test, y_test) def bgl_preprocess_data(para, raw_data, event_mapping_data): """ split logs into sliding windows, built an event count matrix and get the corresponding label @@ -265,4 +336,4 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): assert inst_number == len(labels) print("Among all instances, %d are anomalies"%sum(labels)) assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels + return event_count_matrix, labels \ No newline at end of file From 92df1217aa0233164826150e9e5889dffe77bb90 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 10:51:21 +0100 Subject: [PATCH 2/7] Refactor dataloders into seperate files --- loglizer/{dataloader.py => dataloader/BGL.py} | 188 +++--------------- loglizer/dataloader/HDFS.py | 163 +++++++++++++++ 2 files changed, 193 insertions(+), 158 deletions(-) rename loglizer/{dataloader.py => dataloader/BGL.py} (55%) create mode 100644 loglizer/dataloader/HDFS.py diff --git a/loglizer/dataloader.py b/loglizer/dataloader/BGL.py similarity index 55% rename from loglizer/dataloader.py rename to loglizer/dataloader/BGL.py index 78a229c..c683e13 100644 --- a/loglizer/dataloader.py +++ b/loglizer/dataloader/BGL.py @@ -1,10 +1,9 @@ """ -The interface to load log datasets. The datasets currently supported include -HDFS and BGL. +The interface to load HDFS log datasets. 
Authors: - LogPAI Team - + Vincent-Therrien + Hans Aschenloher """ import random @@ -15,154 +14,6 @@ from sklearn.utils import shuffle from collections import OrderedDict -def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'): - if split_type == 'uniform' and y_data is not None: - pos_idx = y_data > 0 - x_pos = x_data[pos_idx] - y_pos = y_data[pos_idx] - x_neg = x_data[~pos_idx] - y_neg = y_data[~pos_idx] - train_pos = int(train_ratio * x_pos.shape[0]) - train_neg = int(train_ratio * x_neg.shape[0]) - x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]]) - y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]]) - x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]]) - y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]]) - elif split_type == 'sequential': - num_train = int(train_ratio * x_data.shape[0]) - x_train = x_data[0:num_train] - x_test = x_data[num_train:] - if y_data is None: - y_train = None - y_test = None - else: - y_train = y_data[0:num_train] - y_test = y_data[num_train:] - # Random shuffle - indexes = shuffle(np.arange(x_train.shape[0])) - x_train = x_train[indexes] - if y_train is not None: - y_train = y_train[indexes] - return (x_train, y_train), (x_test, y_test) - -def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0): - """ Load HDFS structured log into train and test data - - Arguments - --------- - log_file: str, the file path of structured log. - label_file: str, the file path of anomaly labels, None for unlabeled data - window: str, the window options including `session` (default). - train_ratio: float, the ratio of training data for train/test split. - split_type: `uniform` or `sequential`, which determines how to split dataset. `uniform` means - to split positive samples and negative samples equally when setting label_file. `sequential` - means to split the data sequentially without label_file. That is, the first part is for training, - while the second part is for testing. - - Returns - ------- - (x_train, y_train): the training data - (x_test, y_test): the testing data - """ - - print('====== Input data summary ======') - - if log_file.endswith('.npz'): - # Split training and validation set in a class-uniform way - data = np.load(log_file) - x_data = data['x_data'] - y_data = data['y_data'] - (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type) - - elif log_file.endswith('.csv'): - assert window == 'session', "Only window=session is supported for HDFS dataset." 
- print("Loading", log_file) - struct_log = pd.read_csv(log_file, engine='c', - na_filter=False, memory_map=True) - data_dict = OrderedDict() - for idx, row in struct_log.iterrows(): - blkId_list = re.findall(r'(blk_-?\d+)', row['Content']) - blkId_set = set(blkId_list) - for blk_Id in blkId_set: - if not blk_Id in data_dict: - data_dict[blk_Id] = [] - data_dict[blk_Id].append(row['EventId']) - data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence']) - - if label_file: - # Split training and validation set in a class-uniform way - label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True) - label_data = label_data.set_index('BlockId') - label_dict = label_data['Label'].to_dict() - data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0) - - # Split train and test data - (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, - data_df['Label'].values, train_ratio, split_type) - - print(y_train.sum(), y_test.sum()) - - if save_csv: - data_df.to_csv('data_instances.csv', index=False) - - if window_size > 0: - x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size) - x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size) - log = "{} {} windows ({}/{} anomaly), {}/{} normal" - print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1-y_train).sum(), y_train.shape[0])) - print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1-y_test).sum(), y_test.shape[0])) - return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) - - if label_file is None: - if split_type == 'uniform': - split_type = 'sequential' - print('Warning: Only split_type=sequential is supported \ - if label_file=None.'.format(split_type)) - # Split training and validation set sequentially - x_data = data_df['EventSequence'].values - (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type) - print('Total: {} instances, train: {} instances, test: {} instances'.format( - x_data.shape[0], x_train.shape[0], x_test.shape[0])) - return (x_train, None), (x_test, None), data_df - else: - raise NotImplementedError('load_HDFS() only support csv and npz files!') - - num_train = x_train.shape[0] - num_test = x_test.shape[0] - num_total = num_train + num_test - num_train_pos = sum(y_train) - num_test_pos = sum(y_test) - num_pos = num_train_pos + num_test_pos - - print('Total: {} instances, {} anomaly, {} normal' \ - .format(num_total, num_pos, num_total - num_pos)) - print('Train: {} instances, {} anomaly, {} normal' \ - .format(num_train, num_train_pos, num_train - num_train_pos)) - print('Test: {} instances, {} anomaly, {} normal\n' \ - .format(num_test, num_test_pos, num_test - num_test_pos)) - - return (x_train, y_train), (x_test, y_test) - -def slice_hdfs(x, y, window_size): - results_data = [] - print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) - for idx, sequence in enumerate(x): - seqlen = len(sequence) - i = 0 - while (i + window_size) < seqlen: - slice = sequence[i: i + window_size] - results_data.append([idx, slice, sequence[i + window_size], y[idx]]) - i += 1 - else: - slice = sequence[i: i + window_size] - slice += ["#Pad"] * (window_size - len(slice)) - results_data.append([idx, slice, "#Pad", y[idx]]) - results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) - print("Slicing done, {} windows 
generated".format(results_df.shape[0])) - return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] - - - def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, train_ratio=0.8): """ Read a BGL log file to obtain training and test data. @@ -191,10 +42,10 @@ def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d-%H.%M.%S.%f") df = df.sort_values(by="Time") df.reset_index(drop=True, inplace=True) - df['LineId'] = range(0, df.shape[0]) + df['LineId'] = range(0, df.shape[0]) examples = [] # List of sequences and anomaly labels. - + start_time = df['Timestamp'][0] end_time = df['Timestamp'].iloc[-1] @@ -219,11 +70,11 @@ def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, elif window == "sliding": t0 += stepping_size t1 = t0 + time_interval - + random.shuffle(examples) x = [t[0] for t in examples] y = [t[1] for t in examples] - + n_train = int(len(x) * train_ratio) x_train = np.array(x[:n_train], dtype=list) @@ -237,9 +88,30 @@ def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, .format(len(y_train), sum(y_train), len(y_train) - sum(y_train))) print('Test: {} instances, {} anomaly, {} normal' \ .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) - + return (x_train, y_train), (x_test, y_test) + +def slice_hdfs(x, y, window_size): + results_data = [] + print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) + for idx, sequence in enumerate(x): + seqlen = len(sequence) + i = 0 + while (i + window_size) < seqlen: + slice = sequence[i: i + window_size] + results_data.append([idx, slice, sequence[i + window_size], y[idx]]) + i += 1 + else: + slice = sequence[i: i + window_size] + slice += ["#Pad"] * (window_size - len(slice)) + results_data.append([idx, slice, "#Pad", y[idx]]) + results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) + print("Slicing done, {} windows generated".format(results_df.shape[0])) + return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] + + + def bgl_preprocess_data(para, raw_data, event_mapping_data): """ split logs into sliding windows, built an event count matrix and get the corresponding label @@ -336,4 +208,4 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): assert inst_number == len(labels) print("Among all instances, %d are anomalies"%sum(labels)) assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels \ No newline at end of file + return event_count_matrix, labels diff --git a/loglizer/dataloader/HDFS.py b/loglizer/dataloader/HDFS.py new file mode 100644 index 0000000..157d9f7 --- /dev/null +++ b/loglizer/dataloader/HDFS.py @@ -0,0 +1,163 @@ +""" +The interface to load HDFS log datasets. + +Authors: + LogPAI Team + Hans Aschenloher +""" + +import random +import pandas as pd +import os +import numpy as np +import re +from sklearn.utils import shuffle +from collections import OrderedDict + +def loadDataset(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0): + """ Load HDFS structured log into train and test data + + Arguments + --------- + log_file: str, the file path of structured log. + label_file: str, the file path of anomaly labels, None for unlabeled data + window: str, the window options including `session` (default). 
+ train_ratio: float, the ratio of training data for train/test split. + split_type: `uniform` or `sequential`, which determines how to split dataset. `uniform` means + to split positive samples and negative samples equally when setting label_file. `sequential` + means to split the data sequentially without label_file. That is, the first part is for training, + while the second part is for testing. + + Returns + ------- + (x_train, y_train): the training data + (x_test, y_test): the testing data + """ + + print('====== Input data summary ======') + + if log_file.endswith('.npz'): + # Split training and validation set in a class-uniform way + data = np.load(log_file) + x_data = data['x_data'] + y_data = data['y_data'] + (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type) + + elif log_file.endswith('.csv'): + assert window == 'session', "Only window=session is supported for HDFS dataset." + print("Loading", log_file) + struct_log = pd.read_csv(log_file, engine='c', + na_filter=False, memory_map=True) + data_dict = OrderedDict() + for idx, row in struct_log.iterrows(): + blkId_list = re.findall(r'(blk_-?\d+)', row['Content']) + blkId_set = set(blkId_list) + for blk_Id in blkId_set: + if not blk_Id in data_dict: + data_dict[blk_Id] = [] + data_dict[blk_Id].append(row['EventId']) + data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence']) + + if label_file: + # Split training and validation set in a class-uniform way + label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True) + label_data = label_data.set_index('BlockId') + label_dict = label_data['Label'].to_dict() + data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0) + + # Split train and test data + (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, + data_df['Label'].values, train_ratio, split_type) + + print(y_train.sum(), y_test.sum()) + + if save_csv: + data_df.to_csv('data_instances.csv', index=False) + + if window_size > 0: + x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size) + x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size) + log = "{} {} windows ({}/{} anomaly), {}/{} normal" + print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1-y_train).sum(), y_train.shape[0])) + print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1-y_test).sum(), y_test.shape[0])) + return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) + + if label_file is None: + if split_type == 'uniform': + split_type = 'sequential' + print('Warning: Only split_type=sequential is supported \ + if label_file=None.'.format(split_type)) + # Split training and validation set sequentially + x_data = data_df['EventSequence'].values + (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type) + print('Total: {} instances, train: {} instances, test: {} instances'.format( + x_data.shape[0], x_train.shape[0], x_test.shape[0])) + return (x_train, None), (x_test, None), data_df + else: + raise NotImplementedError('load_HDFS() only support csv and npz files!') + + num_train = x_train.shape[0] + num_test = x_test.shape[0] + num_total = num_train + num_test + num_train_pos = sum(y_train) + num_test_pos = sum(y_test) + num_pos = num_train_pos + num_test_pos + + print('Total: {} instances, {} anomaly, {} normal' \ + .format(num_total, num_pos, num_total - num_pos)) + 
print('Train: {} instances, {} anomaly, {} normal' \ + .format(num_train, num_train_pos, num_train - num_train_pos)) + print('Test: {} instances, {} anomaly, {} normal\n' \ + .format(num_test, num_test_pos, num_test - num_test_pos)) + + return (x_train, y_train), (x_test, y_test) + +def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'): + if split_type == 'uniform' and y_data is not None: + pos_idx = y_data > 0 + x_pos = x_data[pos_idx] + y_pos = y_data[pos_idx] + x_neg = x_data[~pos_idx] + y_neg = y_data[~pos_idx] + train_pos = int(train_ratio * x_pos.shape[0]) + train_neg = int(train_ratio * x_neg.shape[0]) + x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]]) + y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]]) + x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]]) + y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]]) + elif split_type == 'sequential': + num_train = int(train_ratio * x_data.shape[0]) + x_train = x_data[0:num_train] + x_test = x_data[num_train:] + if y_data is None: + y_train = None + y_test = None + else: + y_train = y_data[0:num_train] + y_test = y_data[num_train:] + # Random shuffle + indexes = shuffle(np.arange(x_train.shape[0])) + x_train = x_train[indexes] + if y_train is not None: + y_train = y_train[indexes] + return (x_train, y_train), (x_test, y_test) + + +def slice_hdfs(x, y, window_size): + results_data = [] + print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) + for idx, sequence in enumerate(x): + seqlen = len(sequence) + i = 0 + while (i + window_size) < seqlen: + slice = sequence[i: i + window_size] + results_data.append([idx, slice, sequence[i + window_size], y[idx]]) + i += 1 + else: + slice = sequence[i: i + window_size] + slice += ["#Pad"] * (window_size - len(slice)) + results_data.append([idx, slice, "#Pad", y[idx]]) + results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) + print("Slicing done, {} windows generated".format(results_df.shape[0])) + return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] + From f8548709d2ecbaf5abdf0e79ba9439b520d1593c Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 11:01:30 +0100 Subject: [PATCH 3/7] Update the demos in respect to the dataloder changes --- demo/DecisionTree_demo.py | 8 +++++--- demo/DeepLog_demo.py | 9 +++++---- demo/InvariantsMiner_demo.py | 10 ++++++---- demo/InvariantsMiner_demo_without_labels.py | 16 ++++++++------- demo/IsolationForest_demo.py | 9 +++++---- demo/LR_demo.py | 8 +++++--- demo/LogClustering_demo.py | 10 ++++++---- demo/PCA_demo.py | 12 ++++++----- demo/PCA_demo_without_labels.py | 22 +++++++++++---------- demo/SVM_demo.py | 8 +++++--- 10 files changed, 65 insertions(+), 47 deletions(-) diff --git a/demo/DecisionTree_demo.py b/demo/DecisionTree_demo.py index 387b186..0c9d46e 100644 --- a/demo/DecisionTree_demo.py +++ b/demo/DecisionTree_demo.py @@ -4,15 +4,17 @@ import sys sys.path.append('../') from loglizer.models import DecisionTree -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = 
HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') diff --git a/demo/DeepLog_demo.py b/demo/DeepLog_demo.py index b450ec9..28dd28c 100644 --- a/demo/DeepLog_demo.py +++ b/demo/DeepLog_demo.py @@ -6,7 +6,8 @@ from loglizer import dataloader from loglizer.models import DeepLog from loglizer.preprocessing import Vectorizer, Iterator - +from loglizer import preprocessing +from loglizer.dataloader import HDFS batch_size = 32 hidden_size = 32 @@ -16,14 +17,14 @@ window_size = 10 epoches = 2 num_workers = 2 -device = 0 +device = 0 struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = dataloader.load_HDFS(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform') - + (x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform') + feature_extractor = Vectorizer() train_dataset = feature_extractor.fit_transform(x_train, window_y_train, y_train) test_dataset = feature_extractor.transform(x_test, window_y_test, y_test) diff --git a/demo/InvariantsMiner_demo.py b/demo/InvariantsMiner_demo.py index 05a7c61..bde0c68 100644 --- a/demo/InvariantsMiner_demo.py +++ b/demo/InvariantsMiner_demo.py @@ -4,16 +4,18 @@ import sys sys.path.append('../') from loglizer.models import InvariantsMiner -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file epsilon = 0.5 # threshold for estimating invariant space if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='sequential') feature_extractor = preprocessing.FeatureExtractor() @@ -25,7 +27,7 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/InvariantsMiner_demo_without_labels.py b/demo/InvariantsMiner_demo_without_labels.py index 47a9e6c..876757e 100644 --- a/demo/InvariantsMiner_demo_without_labels.py +++ b/demo/InvariantsMiner_demo_without_labels.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- ''' This is a demo file for the Invariants Mining model. 
API usage: - dataloader.load_HDFS(): load HDFS dataset + HDFS.loadDataset(): load HDFS dataset feature_extractor.fit_transform(): fit and transform features feature_extractor.transform(): feature transform after fitting model.fit(): fit the model @@ -13,7 +13,9 @@ import sys sys.path.append('../') from loglizer.models import InvariantsMiner -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file @@ -21,8 +23,8 @@ if __name__ == '__main__': # Load structured log without label info - (x_train, _), (x_test, _) = dataloader.load_HDFS(struct_log, - window='session', + (x_train, _), (x_test, _) = HDFS.loadDataset(struct_log, + window='session', train_ratio=0.5, split_type='sequential') # Feature extraction @@ -43,11 +45,11 @@ # If you have labeled data, you can evaluate the accuracy of the model as well. # Load structured log with label info - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, - split_type='sequential') + split_type='sequential') x_test = feature_extractor.transform(x_test) precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/IsolationForest_demo.py b/demo/IsolationForest_demo.py index e2cb63d..274eea2 100644 --- a/demo/IsolationForest_demo.py +++ b/demo/IsolationForest_demo.py @@ -4,16 +4,17 @@ import sys sys.path.append('../') from loglizer.models import IsolationForest -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file anomaly_ratio = 0.03 # Estimate the ratio of anomaly samples in the data if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') feature_extractor = preprocessing.FeatureExtractor() @@ -25,7 +26,7 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/LR_demo.py b/demo/LR_demo.py index 56ee5ce..0da1a52 100644 --- a/demo/LR_demo.py +++ b/demo/LR_demo.py @@ -4,15 +4,17 @@ import sys sys.path.append('../') from loglizer.models import LR -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') diff --git a/demo/LogClustering_demo.py b/demo/LogClustering_demo.py index 0c82a09..d3dc852 100644 --- a/demo/LogClustering_demo.py +++ 
b/demo/LogClustering_demo.py @@ -4,7 +4,9 @@ import sys sys.path.append('../') from loglizer.models import LogClustering -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file @@ -12,9 +14,9 @@ anomaly_threshold = 0.3 # the threshold for anomaly detection if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') feature_extractor = preprocessing.FeatureExtractor() @@ -26,6 +28,6 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/PCA_demo.py b/demo/PCA_demo.py index 74a1892..7d47a59 100644 --- a/demo/PCA_demo.py +++ b/demo/PCA_demo.py @@ -4,19 +4,21 @@ import sys sys.path.append('../') from loglizer.models import PCA -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') feature_extractor = preprocessing.FeatureExtractor() - x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', + x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean') x_test = feature_extractor.transform(x_test) @@ -25,6 +27,6 @@ print('Train validation:') precision, recall, f1 = model.evaluate(x_train, y_train) - + print('Test validation:') precision, recall, f1 = model.evaluate(x_test, y_test) diff --git a/demo/PCA_demo_without_labels.py b/demo/PCA_demo_without_labels.py index d54a1c0..6ada2f3 100644 --- a/demo/PCA_demo_without_labels.py +++ b/demo/PCA_demo_without_labels.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- ''' This is a demo file for the PCA model. API usage: - dataloader.load_HDFS(): load HDFS dataset + HDFS.loadDataset(): load HDFS dataset feature_extractor.fit_transform(): fit and transform features feature_extractor.transform(): feature transform after fitting model.fit(): fit the model @@ -13,36 +13,38 @@ import sys sys.path.append('../') from loglizer.models import PCA -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file if __name__ == '__main__': ## 1. 
Load strutured log file and extract feature vectors # Save the raw event sequence file by setting save_csv=True - (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', + (x_train, _), (_, _) = HDFS.loadDataset(struct_log, window='session', split_type='sequential', save_csv=True) feature_extractor = preprocessing.FeatureExtractor() - x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', + x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean') - + ## 2. Train an unsupervised model print('Train phase:') # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner - model = PCA() + model = PCA() # Model hyper-parameters may be sensitive to log data, here we use the default for demo model.fit(x_train) # Make predictions and manually check for correctness. Details may need to go into the raw logs - y_train = model.predict(x_train) + y_train = model.predict(x_train) ## 3. Use the trained model for online anomaly detection print('Test phase:') # Load another new log file. Here we use struct_log for demo only - (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential') + (x_test, _), (_, _) = HDFS.loadDataset(struct_log, window='session', split_type='sequential') # Go through the same feature extraction process with training, using transform() instead - x_test = feature_extractor.transform(x_test) + x_test = feature_extractor.transform(x_test) # Finally make predictions and alter on anomaly cases y_test = model.predict(x_test) - + diff --git a/demo/SVM_demo.py b/demo/SVM_demo.py index 088ffe7..e6cb76f 100644 --- a/demo/SVM_demo.py +++ b/demo/SVM_demo.py @@ -4,15 +4,17 @@ import sys sys.path.append('../') from loglizer.models import SVM -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file if __name__ == '__main__': - (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log, + (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, - window='session', + window='session', train_ratio=0.5, split_type='uniform') From 66c8cd8820460282cf02c9a4c8560493195e81f2 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 11 Jan 2022 15:30:46 +0100 Subject: [PATCH 4/7] Refactor load_BGL mehtod name --- loglizer/dataloader/BGL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loglizer/dataloader/BGL.py b/loglizer/dataloader/BGL.py index c683e13..9f7a738 100644 --- a/loglizer/dataloader/BGL.py +++ b/loglizer/dataloader/BGL.py @@ -14,7 +14,7 @@ from sklearn.utils import shuffle from collections import OrderedDict -def load_BGL(log_file, window='sliding', time_interval=60, stepping_size=30, +def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, train_ratio=0.8): """ Read a BGL log file to obtain training and test data. 
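For reference, after patches 2 and 4 the BGL loader is reachable as loglizer.dataloader.BGL.loadDataset(). The sketch below is a hypothetical demo script (not part of this patch series) showing how that call might be wired up; the input path is an assumption, and the feature-extraction and model steps simply mirror the updated HDFS demos. The structured CSV must contain the columns the loader reads above: Label, Time (formatted "%Y-%m-%d-%H.%M.%S.%f"), a numeric Timestamp, and EventId.

# Hypothetical usage sketch, e.g. demo/BGL_demo.py; path and parameters are
# illustrative only.
import sys
sys.path.append('../')
from loglizer import preprocessing
from loglizer.dataloader import BGL
from loglizer.models import DecisionTree

struct_log = '../data/BGL/BGL_100k.log_structured.csv'  # hypothetical structured BGL log

if __name__ == '__main__':
    # One-hour fixed windows (time_interval is in seconds), 80/20 train/test split.
    (x_train, y_train), (x_test, y_test) = BGL.loadDataset(
        struct_log, window='fixed', time_interval=3600, train_ratio=0.8)

    # Turn each window's event sequence into a weighted event-count vector,
    # exactly as the HDFS demos do.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf')
    x_test = feature_extractor.transform(x_test)

    # Any supervised model from loglizer.models could be used here; DecisionTree
    # is chosen only to match the existing demos.
    model = DecisionTree()
    model.fit(x_train, y_train)

    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

The same pattern should apply to the Thunderbird loader added in patch 5, since its loadDataset() signature and return values are identical to the BGL version.
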
From a0cce9e32b4bb83fda7ee4e8bee3d8ed1ab5e3f9 Mon Sep 17 00:00:00 2001 From: Hans Aschenloher Date: Tue, 8 Feb 2022 23:09:58 +0100 Subject: [PATCH 5/7] Add a thunderbird dataloader --- loglizer/dataloader/Thunderbird.py | 189 +++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 loglizer/dataloader/Thunderbird.py diff --git a/loglizer/dataloader/Thunderbird.py b/loglizer/dataloader/Thunderbird.py new file mode 100644 index 0000000..6497ad0 --- /dev/null +++ b/loglizer/dataloader/Thunderbird.py @@ -0,0 +1,189 @@ +""" +The interface to load HDFS log datasets. + +Authors: + Hans Aschenloher +""" + +import random +import pandas as pd +import os +import numpy as np +import re +from sklearn.utils import shuffle +from collections import OrderedDict + +def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, + train_ratio=0.8): + """ Read a Thunderbird log file to obtain training and test data. + + Args: + -------- + log_file: Input file name with the header + "LineId,Label,Timestamp,,,,,,,,,EventId,,", where commas stand for + unnecessary fields. + windows: Type of windows to use. Can be either 'sliding' or 'fixed'. + time_interval: Time scope of a window in seconds. Used for both fixed and + sliding windows. + stepping_size: Step size of sliding windows in seconds. Used only for + sliding windows. + train_ratio: Fraction of examples to use for training. + + Returns + ------- + (x_train, y_train): The training data. + (x_test, y_test): The testing data. + + """ + + # Load the file and sort lines according to time. + df = pd.read_csv(log_file) + #df['Time'] = pd.to_datetime( str(df['Month'])+" " + str(df['Day']) + " " + str(df['Time']), format="%b %d %H:%M:%S") + df = df.sort_values(by="Timestamp") + df.reset_index(drop=True, inplace=True) + df['LineId'] = range(0, df.shape[0]) + + examples = [] # List of sequences and anomaly labels. + + start_time = df['Timestamp'][0] + end_time = df['Timestamp'].iloc[-1] + + assert window == 'fixed' or window == 'sliding', "Unsupported window." + index = 0 + t0 = start_time + t1 = t0 + time_interval + while t1 < end_time: + sequence = [] + is_anomaly = 0 + # Make a sequence and label it as normal or abnormal. + while df['Timestamp'][index] < t1: + sequence.append(df['EventId'][index]) + if df['Label'][index] != '-': + is_anomaly = 1 + index += 1 + if sequence: + examples.append([sequence, is_anomaly]) + # Translate the window. 
+ if window == "fixed": + t0 = t1 + elif window == "sliding": + t0 += stepping_size + t1 = t0 + time_interval + + random.shuffle(examples) + x = [t[0] for t in examples] + y = [t[1] for t in examples] + + n_train = int(len(x) * train_ratio) + + x_train = np.array(x[:n_train], dtype=list) + y_train = np.array(y[:n_train], dtype=int) + x_test = np.array(x[n_train:], dtype=list) + y_test = np.array(y[n_train:], dtype=int) + + print('Total: {} instances, {} anomaly, {} normal' \ + .format(len(y), sum(y), len(y) - sum(y))) + print('Train: {} instances, {} anomaly, {} normal' \ + .format(len(y_train), sum(y_train), len(y_train) - sum(y_train))) + print('Test: {} instances, {} anomaly, {} normal' \ + .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) + + return (x_train, y_train), (x_test, y_test) + +def thunderbird_preprocess_data(para, raw_data, event_mapping_data): + """ split logs into sliding windows, built an event count matrix and get the corresponding label + + Args: + -------- + para: the parameters dictionary + raw_data: list of (label, time) + event_mapping_data: a list of event index, where each row index indicates a corresponding log + + Returns: + -------- + event_count_matrix: event count matrix, where each row is an instance (log sequence vector) + labels: a list of labels, 1 represents anomaly + """ + + # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running + if not os.path.exists(para['save_path']): + os.mkdir(para['save_path']) + log_size = raw_data.shape[0] + sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' + + #=============divide into sliding windows=========# + start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window + label_data, time_data = raw_data[:,0], raw_data[:, 1] + if not os.path.exists(sliding_file_path): + # split into sliding window + start_time = time_data[0] + start_index = 0 + end_index = 0 + + # get the first start, end index, end time + for cur_time in time_data: + if cur_time < start_time + para['window_size']*3600: + end_index += 1 + end_time = cur_time + else: + start_end_pair=tuple((start_index,end_index)) + start_end_index_list.append(start_end_pair) + break + # move the start and end index until next sliding window + while end_index < log_size: + start_time = start_time + para['step_size']*3600 + end_time = end_time + para['step_size']*3600 + for i in range(start_index,end_index): + if time_data[i] < start_time: + i+=1 + else: + break + for j in range(end_index, log_size): + if time_data[j] < end_time: + j+=1 + else: + break + start_index = i + end_index = j + start_end_pair = tuple((start_index, end_index)) + start_end_index_list.append(start_end_pair) + inst_number = len(start_end_index_list) + print('there are %d instances (sliding windows) in this dataset\n'%inst_number) + np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d') + else: + print('Loading start_end_index_list from file') + start_end_index_list = pd.read_csv(sliding_file_path, header=None).values + inst_number = len(start_end_index_list) + print('there are %d instances (sliding windows) in this dataset' % inst_number) + + # get all the log indexes in each time window by ranging from start_index to end_index + expanded_indexes_list=[] + for t in range(inst_number): + index_list = [] + expanded_indexes_list.append(index_list) + for i in range(inst_number): + 
start_index = start_end_index_list[i][0] + end_index = start_end_index_list[i][1] + for l in range(start_index, end_index): + expanded_indexes_list[i].append(l) + + event_mapping_data = [row[0] for row in event_mapping_data] + event_num = len(list(set(event_mapping_data))) + print('There are %d log events'%event_num) + + #=============get labels and event count of each sliding window =========# + labels = [] + event_count_matrix = np.zeros((inst_number,event_num)) + for j in range(inst_number): + label = 0 #0 represent success, 1 represent failure + for k in expanded_indexes_list[j]: + event_index = event_mapping_data[k] + event_count_matrix[j, event_index] += 1 + if label_data[k]: + label = 1 + continue + labels.append(label) + assert inst_number == len(labels) + print("Among all instances, %d are anomalies"%sum(labels)) + assert event_count_matrix.shape[0] == len(labels) + return event_count_matrix, labels From 027e11539c1154590c81adf461f4f4a27d29d3a7 Mon Sep 17 00:00:00 2001 From: Johann Aschenloher Date: Fri, 11 Feb 2022 08:16:58 +0100 Subject: [PATCH 6/7] Remove unused code --- loglizer/dataloader/BGL.py | 121 +---------------------------- loglizer/dataloader/Thunderbird.py | 102 +----------------------- 2 files changed, 3 insertions(+), 220 deletions(-) diff --git a/loglizer/dataloader/BGL.py b/loglizer/dataloader/BGL.py index 9f7a738..d86581a 100644 --- a/loglizer/dataloader/BGL.py +++ b/loglizer/dataloader/BGL.py @@ -1,5 +1,5 @@ """ -The interface to load HDFS log datasets. +The interface to load BGL log datasets. Authors: Vincent-Therrien @@ -90,122 +90,3 @@ def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) return (x_train, y_train), (x_test, y_test) - - -def slice_hdfs(x, y, window_size): - results_data = [] - print("Slicing {} sessions, with window {}".format(x.shape[0], window_size)) - for idx, sequence in enumerate(x): - seqlen = len(sequence) - i = 0 - while (i + window_size) < seqlen: - slice = sequence[i: i + window_size] - results_data.append([idx, slice, sequence[i + window_size], y[idx]]) - i += 1 - else: - slice = sequence[i: i + window_size] - slice += ["#Pad"] * (window_size - len(slice)) - results_data.append([idx, slice, "#Pad", y[idx]]) - results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"]) - print("Slicing done, {} windows generated".format(results_df.shape[0])) - return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"] - - - -def bgl_preprocess_data(para, raw_data, event_mapping_data): - """ split logs into sliding windows, built an event count matrix and get the corresponding label - - Args: - -------- - para: the parameters dictionary - raw_data: list of (label, time) - event_mapping_data: a list of event index, where each row index indicates a corresponding log - - Returns: - -------- - event_count_matrix: event count matrix, where each row is an instance (log sequence vector) - labels: a list of labels, 1 represents anomaly - """ - - # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running - if not os.path.exists(para['save_path']): - os.mkdir(para['save_path']) - log_size = raw_data.shape[0] - sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' - - #=============divide into sliding windows=========# - start_end_index_list = [] # list of tuples, tuple 
contains two number, which represent the start and end of sliding time window - label_data, time_data = raw_data[:,0], raw_data[:, 1] - if not os.path.exists(sliding_file_path): - # split into sliding window - start_time = time_data[0] - start_index = 0 - end_index = 0 - - # get the first start, end index, end time - for cur_time in time_data: - if cur_time < start_time + para['window_size']*3600: - end_index += 1 - end_time = cur_time - else: - start_end_pair=tuple((start_index,end_index)) - start_end_index_list.append(start_end_pair) - break - # move the start and end index until next sliding window - while end_index < log_size: - start_time = start_time + para['step_size']*3600 - end_time = end_time + para['step_size']*3600 - for i in range(start_index,end_index): - if time_data[i] < start_time: - i+=1 - else: - break - for j in range(end_index, log_size): - if time_data[j] < end_time: - j+=1 - else: - break - start_index = i - end_index = j - start_end_pair = tuple((start_index, end_index)) - start_end_index_list.append(start_end_pair) - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset\n'%inst_number) - np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d') - else: - print('Loading start_end_index_list from file') - start_end_index_list = pd.read_csv(sliding_file_path, header=None).values - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset' % inst_number) - - # get all the log indexes in each time window by ranging from start_index to end_index - expanded_indexes_list=[] - for t in range(inst_number): - index_list = [] - expanded_indexes_list.append(index_list) - for i in range(inst_number): - start_index = start_end_index_list[i][0] - end_index = start_end_index_list[i][1] - for l in range(start_index, end_index): - expanded_indexes_list[i].append(l) - - event_mapping_data = [row[0] for row in event_mapping_data] - event_num = len(list(set(event_mapping_data))) - print('There are %d log events'%event_num) - - #=============get labels and event count of each sliding window =========# - labels = [] - event_count_matrix = np.zeros((inst_number,event_num)) - for j in range(inst_number): - label = 0 #0 represent success, 1 represent failure - for k in expanded_indexes_list[j]: - event_index = event_mapping_data[k] - event_count_matrix[j, event_index] += 1 - if label_data[k]: - label = 1 - continue - labels.append(label) - assert inst_number == len(labels) - print("Among all instances, %d are anomalies"%sum(labels)) - assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels diff --git a/loglizer/dataloader/Thunderbird.py b/loglizer/dataloader/Thunderbird.py index 6497ad0..2b11e55 100644 --- a/loglizer/dataloader/Thunderbird.py +++ b/loglizer/dataloader/Thunderbird.py @@ -1,5 +1,5 @@ """ -The interface to load HDFS log datasets. +The interface to load Thunderbird log datasets. Authors: Hans Aschenloher @@ -38,7 +38,7 @@ def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, # Load the file and sort lines according to time. 
df = pd.read_csv(log_file) - #df['Time'] = pd.to_datetime( str(df['Month'])+" " + str(df['Day']) + " " + str(df['Time']), format="%b %d %H:%M:%S") + df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d-%H.%M.%S.%f") df = df.sort_values(by="Timestamp") df.reset_index(drop=True, inplace=True) df['LineId'] = range(0, df.shape[0]) @@ -89,101 +89,3 @@ def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30, .format(len(y_test), sum(y_test), len(y_test) - sum(y_test))) return (x_train, y_train), (x_test, y_test) - -def thunderbird_preprocess_data(para, raw_data, event_mapping_data): - """ split logs into sliding windows, built an event count matrix and get the corresponding label - - Args: - -------- - para: the parameters dictionary - raw_data: list of (label, time) - event_mapping_data: a list of event index, where each row index indicates a corresponding log - - Returns: - -------- - event_count_matrix: event count matrix, where each row is an instance (log sequence vector) - labels: a list of labels, 1 represents anomaly - """ - - # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running - if not os.path.exists(para['save_path']): - os.mkdir(para['save_path']) - log_size = raw_data.shape[0] - sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' - - #=============divide into sliding windows=========# - start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window - label_data, time_data = raw_data[:,0], raw_data[:, 1] - if not os.path.exists(sliding_file_path): - # split into sliding window - start_time = time_data[0] - start_index = 0 - end_index = 0 - - # get the first start, end index, end time - for cur_time in time_data: - if cur_time < start_time + para['window_size']*3600: - end_index += 1 - end_time = cur_time - else: - start_end_pair=tuple((start_index,end_index)) - start_end_index_list.append(start_end_pair) - break - # move the start and end index until next sliding window - while end_index < log_size: - start_time = start_time + para['step_size']*3600 - end_time = end_time + para['step_size']*3600 - for i in range(start_index,end_index): - if time_data[i] < start_time: - i+=1 - else: - break - for j in range(end_index, log_size): - if time_data[j] < end_time: - j+=1 - else: - break - start_index = i - end_index = j - start_end_pair = tuple((start_index, end_index)) - start_end_index_list.append(start_end_pair) - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset\n'%inst_number) - np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d') - else: - print('Loading start_end_index_list from file') - start_end_index_list = pd.read_csv(sliding_file_path, header=None).values - inst_number = len(start_end_index_list) - print('there are %d instances (sliding windows) in this dataset' % inst_number) - - # get all the log indexes in each time window by ranging from start_index to end_index - expanded_indexes_list=[] - for t in range(inst_number): - index_list = [] - expanded_indexes_list.append(index_list) - for i in range(inst_number): - start_index = start_end_index_list[i][0] - end_index = start_end_index_list[i][1] - for l in range(start_index, end_index): - expanded_indexes_list[i].append(l) - - event_mapping_data = [row[0] for row in event_mapping_data] - event_num = len(list(set(event_mapping_data))) 
- print('There are %d log events'%event_num) - - #=============get labels and event count of each sliding window =========# - labels = [] - event_count_matrix = np.zeros((inst_number,event_num)) - for j in range(inst_number): - label = 0 #0 represent success, 1 represent failure - for k in expanded_indexes_list[j]: - event_index = event_mapping_data[k] - event_count_matrix[j, event_index] += 1 - if label_data[k]: - label = 1 - continue - labels.append(label) - assert inst_number == len(labels) - print("Among all instances, %d are anomalies"%sum(labels)) - assert event_count_matrix.shape[0] == len(labels) - return event_count_matrix, labels From a7fe3de7d3baf1486e74835acd172342da0572fc Mon Sep 17 00:00:00 2001 From: Johann Aschenloher Date: Fri, 11 Feb 2022 08:26:23 +0100 Subject: [PATCH 7/7] Update benchmark in respect to the dataloder changes --- benchmarks/HDFS_bechmark.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/benchmarks/HDFS_bechmark.py b/benchmarks/HDFS_bechmark.py index 788a8b4..c36dc35 100644 --- a/benchmarks/HDFS_bechmark.py +++ b/benchmarks/HDFS_bechmark.py @@ -5,27 +5,28 @@ sys.path.append('../') import pandas as pd from loglizer.models import * -from loglizer import dataloader, preprocessing +from loglizer import preprocessing +from loglizer.dataloader import HDFS -run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR', +run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR', 'SVM', 'DecisionTree'] struct_log = '../data/HDFS/HDFS.npz' # The benchmark dataset if __name__ == '__main__': - (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log, - window='session', - train_ratio=0.5, - split_type='uniform') + (x_tr, y_train), (x_te, y_test) = HDFS.loadDataset(struct_log, + window='session', + train_ratio=0.5, + split_type='uniform') benchmark_results = [] for _model in run_models: print('Evaluating {} on HDFS:'.format(_model)) if _model == 'PCA': feature_extractor = preprocessing.FeatureExtractor() - x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf', + x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf', normalization='zero-mean') model = PCA() model.fit(x_train) - + elif _model == 'InvariantsMiner': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_tr) @@ -41,7 +42,7 @@ elif _model == 'IsolationForest': feature_extractor = preprocessing.FeatureExtractor() x_train = feature_extractor.fit_transform(x_tr) - model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03, + model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03, n_jobs=4) model.fit(x_train) @@ -62,7 +63,7 @@ x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf') model = DecisionTree() model.fit(x_train, y_train) - + x_test = feature_extractor.transform(x_te) print('Train accuracy:') precision, recall, f1 = model.evaluate(x_train, y_train)