Changed the importer, adding support for config files, etc
kiddinn authored May 7, 2020
2 parents 4db84f8 + e66fc4f commit 36952df
Showing 7 changed files with 446 additions and 66 deletions.
6 changes: 5 additions & 1 deletion importer_client/python/setup.py
@@ -24,7 +24,7 @@

setup(
    name='timesketch-import-client',
    version='20200505',
    version='20200507',
    description='Timesketch Import Client',
    license='Apache License, Version 2.0',
    url='http://www.timesketch.org/',
@@ -36,6 +36,10 @@
        'Operating System :: OS Independent',
        'Programming Language :: Python',
    ],
    data_files=[
        ('data', glob.glob(
            os.path.join('timesketch_import_client', 'data', '*.yaml'))),
    ],
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
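The new data_files entry bundles the YAML files under timesketch_import_client/data with the distribution. Below is a minimal sketch of checking, after installation, that the default config referenced later in this commit can be resolved next to the data subpackage; nothing in the sketch is part of the commit itself:

import os

# The data subpackage is added later in this commit; formatter.yaml is the
# default config that load_config() falls back to.
from timesketch_import_client import data as import_data

config_dir = os.path.dirname(import_data.__file__)
default_config = os.path.join(config_dir, 'formatter.yaml')

print('Default config present:', os.path.isfile(default_config))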
60 changes: 60 additions & 0 deletions importer_client/python/timesketch_import_client/data/__init__.py
@@ -0,0 +1,60 @@
# Copyright 2020 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Timesketch data import configuration."""
from __future__ import unicode_literals

import codecs
import logging
import os

import yaml


logger = logging.getLogger('import_client_config_loader')

DEFAULT_FILE = 'formatter.yaml'


def load_config(file_path=''):
"""Loads YAML config and returns a list of dict with the results.
Args:
file_path (str): path to the YAML config file. This is optional
and if not defined the default formatter.yaml file will be
used that comes with the tool.
Returns:
A list with dicts containing the loaded YAML config.
"""
if not file_path:
base_path = os.path.dirname(__file__)
file_path = os.path.join(base_path, DEFAULT_FILE)

if not file_path.endswith('.yaml'):
logger.error('Can\'t load a config that is not a YAML file.')
return []

if not os.path.isfile(file_path):
logger.error('File path does not exist, unable to load YAML config.')
return []

with codecs.open(file_path, 'r') as fh:
try:
data = yaml.safe_load(fh)
except (AttributeError, yaml.parser.ParserError) as e:
logger.error('Unable to parse YAML file, with error: %s', e)
return []
if not data:
return []
return data
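A minimal sketch of how this loader might be used together with the message format string documented in the config file below; the sample row values and variable names are illustrative and not part of this commit:

from timesketch_import_client import data as import_data

# Load the bundled formatter.yaml; a custom path could be passed instead,
# e.g. import_data.load_config('/path/to/my_formatters.yaml').
config = import_data.load_config()

# For the bundled formatter.yaml, yaml.safe_load returns a mapping keyed by
# config name, so the redline entry is looked up by name here.
redline_conf = config['redline']

# Expand the message format string against a single, made-up row.
sample_row = {
    'UniqueUsername': 'alice',
    'EventType': 'Login',
    'Summary1': 'console',
    'Summary2': 'success',
    'Summary3': 'n/a',
}
print(redline_conf['message'].format(**sample_row))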
65 changes: 65 additions & 0 deletions importer_client/python/timesketch_import_client/data/formatter.yaml
@@ -0,0 +1,65 @@
# This YAML file defines how to process log files.
# The parameters here can be defined either as arguments
# to the import streamer, as a separate config file
# or in this default config.
#
# The format of the file is:
#
# name:
#     message: '<format_string>'
#     timestamp_desc: <description>
#     datetime: <column with time information>
#     separator: <if a csv file then the separator>
#     encoding: <encoding>
#     data_type: <data_type>
#     columns: <list_of_columns>
#     columns_subset: <list_of_columns>
#
# The config fields are either used for configuring the streamer or for
# identifying the file. The file identifier is either the use of a data_type
# or a list of available columns in the file.
#
# Configuration parameters:
# message - this is the format string for the message attribute. It can
# consist of a string with curly brackets for variable
# expansion, e.g. "User {user} visited {url}" would generate
# a message string where the attributes "user" and "url"
# get expanded.
#
# timestamp_desc - a string value that will be used to set the timestamp
# description field.
#
# datetime - if there is no column called datetime this config can be set
# to tell the tool in what column the date information is
# stored. Otherwise the tool will attempt to guess based on
# column names.
#
# separator - only applicable to CSV files; if the separator is not a
# comma (,) then this variable can be set to indicate the
# separator value.
#
# encoding - if the file encoding is not UTF-8 it can be set here.
#
# Identification parameters:
# data_type - this can be used if there is a field in the data set that
# is called "data_type". There can only be one value of
# "data_type" in the data set for it to be matched on.
#
# columns - a list of columns that should be present in the data file
# for this to be a match. Note that all of the listed
# columns need to be present and no extra columns may be
# in the log file. If there may be extra columns, use
# columns_subset instead.
#
# columns_subset - a list of columns that should be present in the data
# file. This list defines a subset of the columns that
# can be present. To match, all of the columns listed
# here need to be present, but the file may also contain
# extra columns.

redline:
    message: 'User: {UniqueUsername}, with event type: {EventType} => {Summary1} - {Summary2} - {Summary3}'
    timestamp_desc: 'Event Logged'
    datetime: 'EventTimestamp'
    columns_subset: 'EventTimestamp,EventType,AuditType,Summary1,Summary2,Summary3,UniqueUsername'
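As a rough illustration of how an entry like redline above could be used to identify a matching data file, the sketch below splits the columns_subset string and checks it against the column names of a hypothetical Redline CSV export. The matching logic in the import streamer itself may differ, and the file columns shown here are made up for the example:

from timesketch_import_client import data as import_data

config = import_data.load_config()
redline = config['redline']

# columns_subset is stored as a comma separated string in the YAML file.
required_columns = set(redline['columns_subset'].split(','))

# Column names from a hypothetical Redline CSV export (illustrative only).
file_columns = {
    'EventTimestamp', 'EventType', 'AuditType', 'Summary1',
    'Summary2', 'Summary3', 'UniqueUsername', 'ExtraColumn',
}

# With columns_subset every listed column must be present, but extra
# columns in the file are allowed.
print('redline config matches:', required_columns.issubset(file_columns))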
