Skip to content

Commit

Permalink
Merge branch 'master' into fix-feature-extraction-data
Browse files Browse the repository at this point in the history
  • Loading branch information
jkppr authored Nov 13, 2023
2 parents beaa7d9 + d492c87 commit 9a78e6f
Show file tree
Hide file tree
Showing 4 changed files with 246 additions and 26 deletions.
10 changes: 5 additions & 5 deletions docs/developers/testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@ Switch to:
And execute the single test

```shell
! nosetests timesketch/lib/emojis_test.py -v
! python3 -m pytest timesketch/lib/emojis_test.py -v
```

Or all in one:

```bash
$ sudo docker exec -it $CONTAINER_ID nosetests /usr/local/src/timesketch/timesketch/lib/emojis_test.py -v
$ sudo docker exec -it $CONTAINER_ID python3 -m pytest /usr/local/src/timesketch/timesketch/lib/emojis_test.py -v
```

## Writing unittests
Expand All @@ -71,7 +71,7 @@ breakpoint()
And then within the docker container execute

```shell
! nosetests /usr/local/src/timesketchtimesketch/lib/emojis_test.py -s -pdb
! python3 -m pytest /usr/local/src/timesketch/timesketch/lib/emojis_test.py -s -pdb
```

## end2end tests
Expand Down Expand Up @@ -104,8 +104,8 @@ The following example is for changing / adding tests to `client_test.py`
```shell
$ export CONTAINER_ID="$(sudo -E docker container list -f name=e2e_timesketch -q)"
$ docker exec -it $CONTAINER_ID /bin/bash
! rm /usr/local/lib/python3.8/dist-packages/end_to_end_tests/client_test.py
! ln -s /usr/local/src/timesketch/end_to_end_tests/client_test.py /usr/local/lib/python3.8/dist-packages/end_to_end_tests/client_test.py
! rm /usr/local/lib/python3.10/dist-packages/end_to_end_tests/client_test.py
! ln -s /usr/local/src/timesketch/end_to_end_tests/client_test.py /usr/local/lib/python3.10/dist-packages/end_to_end_tests/client_test.py
```

From now on you can edit the `client_test.py` file outside of the docker instance and run it again with
Expand Down
39 changes: 32 additions & 7 deletions end_to_end_tests/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,27 +61,35 @@ def __init__(self):
self._counter = collections.Counter()
self._imported_files = []

def import_timeline(self, filename, index_name=None):
def import_timeline(self, filename, index_name=None, sketch=None):
"""Import a Plaso, CSV or JSONL file.
Args:
filename (str): Filename of the file to be imported.
index_name (str): The OpenSearch index to store the documents in.
sketch (Sketch): Optional sketch object to add the timeline to.
if no sketch is provided, the default sketch is used.
Raises:
TimeoutError if import takes too long.
"""
if not sketch:
sketch = self.sketch
if filename in self._imported_files:
return
file_path = os.path.join(TEST_DATA_DIR, filename)
if not index_name:
index_name = uuid.uuid4().hex

with importer.ImportStreamer() as streamer:
streamer.set_sketch(self.sketch)
streamer.set_sketch(sketch)
streamer.set_timeline_name(file_path)
streamer.set_index_name(index_name)
streamer.set_provider("e2e test interface")
streamer.add_file(file_path)
timeline = streamer.timeline
if not timeline:
print("Error creating timeline, please try again.")

# Poll the timeline status and wait for the timeline to be ready
max_time_seconds = 600 # Timeout after 10min
Expand All @@ -91,8 +99,18 @@ def import_timeline(self, filename, index_name=None):
while True:
if retry_count >= max_retries:
raise TimeoutError
_ = timeline.lazyload_data(refresh_cache=True)
status = timeline.status

try:
if not timeline:
print("Error no timeline yet, trying to get the new one")
timeline = streamer.timeline
_ = timeline.lazyload_data(refresh_cache=True)
status = timeline.status
except AttributeError:
# The timeline is not ready yet, so we need to wait
retry_count += 1
time.sleep(sleep_time_seconds)
continue

if not timeline.index:
retry_count += 1
Expand All @@ -101,7 +119,9 @@ def import_timeline(self, filename, index_name=None):

if status == "fail" or timeline.index.status == "fail":
if retry_count > 3:
raise RuntimeError("Unable to import timeline.")
raise RuntimeError(
f"Unable to import timeline {timeline.index.id}."
)

if status == "ready" and timeline.index.status == "ready":
break
Expand Down Expand Up @@ -133,7 +153,8 @@ def import_directly_to_opensearch(self, filename, index_name):
raise ValueError("File [{0:s}] does not exist.".format(file_path))

es = opensearchpy.OpenSearch(
[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}], http_compress=True
[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}],
http_compress=True,
)

df = pd.read_csv(file_path, on_bad_lines="warn")
Expand All @@ -143,7 +164,11 @@ def import_directly_to_opensearch(self, filename, index_name):
def _pandas_to_opensearch(data_frame):
for _, row in data_frame.iterrows():
row.dropna(inplace=True)
yield {"_index": index_name, "_type": "_doc", "_source": row.to_dict()}
yield {
"_index": index_name,
"_type": "_doc",
"_source": row.to_dict(),
}

if os.path.isfile(OPENSEARCH_MAPPINGS_FILE):
mappings = {}
Expand Down
203 changes: 203 additions & 0 deletions end_to_end_tests/upload_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""End to end tests of Timesketch upload functionality."""
import os
import random

from timesketch_api_client import search
from . import interface
from . import manager

Expand All @@ -27,5 +30,205 @@ def test_invalid_index_name(self):
with self.assertions.assertRaises(RuntimeError):
self.import_timeline("evtx.plaso", index_name="/invalid/index/name")

def test_normal_upload_json(self):
    """Test the upload of a json file with a few events."""
    # A random suffix keeps sketch and index names unique between runs.
    suffix = random.randint(0, 10000)
    sketch = self.api.create_sketch(name=f"test_normal_upload_json {suffix}")
    self.sketch = sketch

    file_path = (
        "/usr/local/src/timesketch/end_to_end_tests/test_data/sigma_events.jsonl"
    )
    self.import_timeline(file_path, index_name=suffix, sketch=sketch)

    # The importer uses the file path as the timeline name; verify the
    # timeline metadata once the upload has completed.
    timeline = sketch.list_timelines()[0]
    self.assertions.assertEqual(timeline.name, file_path)
    self.assertions.assertEqual(timeline.index.name, str(suffix))
    self.assertions.assertEqual(timeline.index.status, "ready")

    # All four events from the test data file must be searchable.
    events = sketch.explore("*", as_pandas=True)
    self.assertions.assertEqual(len(events), 4)

def test_large_upload_jsonl(self):
    """Test uploading a timeline with a lot of events as jsonl.

    Writes a temporary file with a large number of events, uploads it
    to Timesketch, and then checks that the number of events in the
    resulting timeline is correct.
    """
    # A random suffix keeps sketch and index names unique between runs.
    suffix = random.randint(0, 10000)
    sketch = self.api.create_sketch(name=f"test_large_upload_json {suffix}")
    self.sketch = sketch

    file_path = "/tmp/large.jsonl"
    event_count = 4123

    # Generate the events lazily and write them out in one call.
    with open(file_path, "w", encoding="utf-8") as file_object:
        file_object.writelines(
            f'{{"message":"Count {idx} {suffix}","timestamp":"123456789","datetime":"2015-07-24T19:01:01+00:00","timestamp_desc":"Write time","data_type":"foobarjson"}}\n'  # pylint: disable=line-too-long
            for idx in range(event_count)
        )

    self.import_timeline("/tmp/large.jsonl", index_name=suffix, sketch=sketch)
    os.remove(file_path)

    # The importer uses the file path as the timeline name.
    timeline = sketch.list_timelines()[0]
    self.assertions.assertEqual(timeline.name, file_path)
    self.assertions.assertEqual(timeline.index.name, str(suffix))
    self.assertions.assertEqual(timeline.index.status, "ready")

    search_obj = search.Search(sketch)
    search_obj.query_string = "data_type:foobarjson"
    search_obj.commit()
    self.assertions.assertEqual(len(search_obj.table), event_count)

    # Cross-check the event count via the explore endpoint as well.
    events = sketch.explore("data_type:foobarjson", as_pandas=True)
    self.assertions.assertEqual(len(events), event_count)

def test_very_large_upload_jsonl(self):
    """Test uploading a timeline with over 50k events as jsonl.

    Writes a temporary file, uploads it to Timesketch, and checks the
    event count both with the default query cap and with an increased
    max_entries limit.
    """
    # A random suffix keeps sketch and index names unique between runs.
    suffix = random.randint(0, 10000)
    sketch = self.api.create_sketch(name=f"test__very_large_upload_json {suffix}")
    self.sketch = sketch

    file_path = "/tmp/verylarge.jsonl"
    event_count = 74251

    # Generate the events lazily and write them out in one call.
    with open(file_path, "w", encoding="utf-8") as file_object:
        file_object.writelines(
            f'{{"message":"Count {idx} {suffix}","timestamp":"123456789","datetime":"2015-07-24T19:01:01+00:00","timestamp_desc":"Write time","data_type":"foobarjsonverlarge"}}\n'  # pylint: disable=line-too-long
            for idx in range(event_count)
        )

    self.import_timeline(file_path, index_name=suffix, sketch=sketch)
    os.remove(file_path)

    # The importer uses the file path as the timeline name.
    timeline = sketch.list_timelines()[0]
    self.assertions.assertEqual(timeline.name, file_path)
    self.assertions.assertEqual(timeline.index.name, str(suffix))
    self.assertions.assertEqual(timeline.index.status, "ready")

    search_obj = search.Search(sketch)
    search_obj.query_string = "data_type:foobarjsonverlarge"
    search_obj.commit()

    # The default query caps the returned table at 10k events.
    self.assertions.assertEqual(len(search_obj.table), 10000)
    self.assertions.assertEqual(search_obj.expected_size, event_count)

    # Raising max_entries returns the full result set.
    search_obj.max_entries = 100000
    search_obj.commit()
    self.assertions.assertEqual(len(search_obj.table), event_count)

    # Cross-check the event count via the explore endpoint as well.
    events = sketch.explore(
        "data_type:foobarjsonverlarge", as_pandas=True, max_entries=100000
    )
    self.assertions.assertEqual(len(events), event_count)

def test_large_upload_csv(self):
    """Test uploading a timeline with a lot of events as CSV.

    The test creates a temporary file with a large number of events
    and then uploads the file to Timesketch. It then checks that the
    number of events in the timeline is correct.
    """
    # Create a new sketch. Use a descriptive name, consistent with the
    # other upload tests, instead of a bare random integer.
    rand = random.randint(0, 10000)
    sketch = self.api.create_sketch(name=f"test_large_upload_csv {rand}")
    self.sketch = sketch

    file_path = "/tmp/large.csv"
    event_count = 3251

    with open(file_path, "w", encoding="utf-8") as file_object:
        # CSV header with the columns the importer expects.
        file_object.write(
            '"message","timestamp","datetime","timestamp_desc","data_type"\n'
        )

        for i in range(event_count):
            # write a line with random values for message
            string = (
                f'"CSV Count: {i} {rand}","123456789",'
                '"2015-07-24T19:01:01+00:00","Write time","foobarcsv"\n'
            )
            file_object.write(string)

    self.import_timeline(file_path, index_name=rand, sketch=sketch)
    os.remove(file_path)

    timeline = sketch.list_timelines()[0]
    # check that timeline was uploaded correctly
    self.assertions.assertEqual(timeline.name, file_path)
    self.assertions.assertEqual(timeline.index.name, str(rand))
    self.assertions.assertEqual(timeline.index.status, "ready")

    search_obj = search.Search(sketch)
    search_obj.query_string = "data_type:foobarcsv"
    search_obj.commit()
    self.assertions.assertEqual(len(search_obj.table), event_count)

    # check that the number of events is correct with a different method
    events = sketch.explore("data_type:foobarcsv", as_pandas=True)
    self.assertions.assertEqual(len(events), event_count)

def test_large_upload_csv_over_flush_limit(self):
    """Test uploading a timeline with a lot of events (> 50k) as CSV.

    The test creates a temporary file with a large number of events
    and then uploads the file to Timesketch. It then checks that the
    number of events in the timeline is correct, both with the default
    query cap and with an increased max_entries limit.
    """
    # Create a new sketch. Use a descriptive name, consistent with the
    # other upload tests, instead of a bare random integer.
    rand = random.randint(0, 10000)
    sketch = self.api.create_sketch(
        name=f"test_large_upload_csv_over_flush_limit {rand}"
    )
    self.sketch = sketch

    file_path = "/tmp/verylarge.csv"
    event_count = 73251

    with open(file_path, "w", encoding="utf-8") as file_object:
        # CSV header with the columns the importer expects.
        file_object.write(
            '"message","timestamp","datetime","timestamp_desc","data_type"\n'
        )

        for i in range(event_count):
            # write a line with random values for message
            string = (
                f'"CSV Count: {i} {rand}","123456789",'
                '"2015-07-24T19:01:01+00:00","Write time","73kcsv"\n'
            )
            file_object.write(string)

    self.import_timeline(file_path, index_name=rand, sketch=sketch)
    os.remove(file_path)

    timeline = sketch.list_timelines()[0]
    # check that timeline was uploaded correctly
    self.assertions.assertEqual(timeline.name, file_path)
    self.assertions.assertEqual(timeline.index.name, str(rand))
    self.assertions.assertEqual(timeline.index.status, "ready")

    search_obj = search.Search(sketch)
    search_obj.query_string = "data_type:73kcsv"
    search_obj.commit()

    # normal max query limit caps the returned table at 10k events
    self.assertions.assertEqual(len(search_obj.table), 10000)
    self.assertions.assertEqual(search_obj.expected_size, event_count)

    # increase max entries returned to get the full result set
    search_obj.max_entries = 100000
    search_obj.commit()
    self.assertions.assertEqual(len(search_obj.table), event_count)

    # check that the number of events is correct with a different method
    events = sketch.explore("data_type:73kcsv", as_pandas=True, max_entries=100000)
    self.assertions.assertEqual(len(events), event_count)


manager.EndToEndTestManager.register_test(UploadTest)
Loading

0 comments on commit 9a78e6f

Please sign in to comment.