From c7aae9f4386cb89bbf9976ab439672cf98a40d83 Mon Sep 17 00:00:00 2001
From: Kamesh Akella
Date: Fri, 9 Aug 2024 03:57:40 -0400
Subject: [PATCH] add new perfInsights.py script to analyse the data generated in result_data branch (#905)

Signed-off-by: Kamesh Akella
Signed-off-by: Alexander Schwartz
Co-authored-by: Alexander Schwartz
---
 .gitignore                                 |   5 +
 benchmark/src/main/python/README.adoc      |   3 +
 benchmark/src/main/python/perfInsights.py  | 140 ++++++++++++++++++
 benchmark/src/main/python/requirements.txt |   4 +-
 .../ROOT/pages/util/perf-insights.adoc     |  45 ++++++
 .../modules/ROOT/partials/util-nav.adoc    |   1 +
 6 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/src/main/python/README.adoc
 create mode 100644 benchmark/src/main/python/perfInsights.py
 create mode 100644 doc/kubernetes/modules/ROOT/pages/util/perf-insights.adoc

diff --git a/.gitignore b/.gitignore
index 61d503409..8ee40c44a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,11 @@ gatling-charts-*
 # Grafana report pdfs
 benchmark/src/main/python/grafana_report_pdfs
 
+# Python files
+benchmark/src/main/python/venv
+benchmark/src/main/python/results
+benchmark/src/main/python/perf_insights.log
+
 # Intellij
 ###################
 .idea
diff --git a/benchmark/src/main/python/README.adoc b/benchmark/src/main/python/README.adoc
new file mode 100644
index 000000000..63a47cc5a
--- /dev/null
+++ b/benchmark/src/main/python/README.adoc
@@ -0,0 +1,3 @@
+To find out more about the Grafana dashboard-to-PDF script `snapGrafana.py`, visit https://www.keycloak.org/keycloak-benchmark/kubernetes-guide/latest/util/grafana#snapgrafana-py-cli-options
+
+To find out more about the performance analysis script `perfInsights.py`, visit https://www.keycloak.org/keycloak-benchmark/kubernetes-guide/latest/util/perf-insights
diff --git a/benchmark/src/main/python/perfInsights.py b/benchmark/src/main/python/perfInsights.py
new file mode 100644
index 000000000..4d454c654
--- /dev/null
+++ b/benchmark/src/main/python/perfInsights.py
@@ -0,0 +1,140 @@
+import pandas as pd
+import requests
+import argparse
+from pandas import json_normalize
+import logging
+import json
+
+def setup_logger(log_file):
+    # Set up logging to a file
+    logging.basicConfig(filename=log_file, level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    logger = logging.getLogger()
+    return logger
+
+def fetch_and_process_json(github_user, github_repo, branch_name, json_directory, logger):
+    # GitHub API URL to list files in the directory on a specific branch
+    api_url = f'https://api.github.com/repos/{github_user}/{github_repo}/contents/{json_directory}?ref={branch_name}'
+
+    # Fetch the list of files in the directory
+    response = requests.get(api_url)
+    files = response.json()
+
+    # Dictionary to store DataFrames for each test
+    data_frames = {
+        'memoryUsageTest': [],
+        'cpuUsageForLoginsTest': [],
+        'cpuUsageForCredentialGrantsTest': []
+    }
+
+    basic_df = []
+
+    # Fetch each JSON file and append to the corresponding list
+    for file in files:
+        if file['name'].endswith('.json'):
+            file_url = file['download_url']
+            file_response = requests.get(file_url)
+            file_json = file_response.json()
+            df = pd.json_normalize(file_json)
+            basic_df.append(df)
+
+            # Debug: log the JSON content
+            logger.debug("Processing file: %s", file['name'])
+            logger.debug("JSON content: %s", json.dumps(file_json, indent=2))
+
+            # Normalize the JSON to extract specific fields for each test
+            for test in data_frames.keys():
+                if test in file_json:
+                    df = json_normalize(
+                        file_json,
+                        record_path=[test, 'statistics'],
+                        meta=[
+                            'start',
+                            'context',
+                            [test, 'activeSessionsPer500MbPerPod'],
+                            [test, 'userLoginsPerSecPer1vCpuPerPod'],
+                            [test, 'credentialGrantsPerSecPer1vCpu']
+                        ],
+                        record_prefix=f'{test}.',
+                        errors='ignore'
+                    )
+                    data_frames[test].append(df)
+
+    combined_df = pd.concat(basic_df, ignore_index=True)
+    perf_across_deployments_df = combined_df[['start', 'context.externalInfinispanFeatureEnabled', 'context.persistentSessionsEnabled', 'cpuUsageForLoginsTest.userLoginsPerSecPer1vCpuPerPod', 'cpuUsageForCredentialGrantsTest.credentialGrantsPerSecPer1vCpu', 'memoryUsageTest.activeSessionsPer500MbPerPod']]
+
+    print(perf_across_deployments_df.to_csv(index=False))
+    # Concatenate all DataFrames for each test into a single DataFrame
+    combined_data_frames = {test: pd.concat(data_frames[test], ignore_index=True) for test in data_frames}
+
+    # Log the columns of the combined DataFrames
+    for test, df in combined_data_frames.items():
+        logger.debug(f"{test} DataFrame columns: {df.columns.tolist()}")
+        logger.debug(f"{test} DataFrame sample: {df.head()}")
+
+    return combined_data_frames
+
+def save_to_csv(data_frames, json_directory, output_directory):
+    # Columns to include in the final CSVs for each test
+    columns_to_include = {
+        'memoryUsageTest': [
+            'start',
+            'context',
+            'memoryUsageTest.name',
+            'memoryUsageTest.activeSessionsPer500MbPerPod',
+            'memoryUsageTest.meanResponseTime.total',
+            'memoryUsageTest.percentiles1.total',
+            'memoryUsageTest.meanNumberOfRequestsPerSecond.total'
+        ],
+        'cpuUsageForLoginsTest': [
+            'start',
+            'context',
+            'cpuUsageForLoginsTest.name',
+            'cpuUsageForLoginsTest.userLoginsPerSecPer1vCpuPerPod',
+            'cpuUsageForLoginsTest.meanResponseTime.total',
+            'cpuUsageForLoginsTest.percentiles1.total',
+            'cpuUsageForLoginsTest.meanNumberOfRequestsPerSecond.total'
+        ],
+        'cpuUsageForCredentialGrantsTest': [
+            'start',
+            'context',
+            'cpuUsageForCredentialGrantsTest.name',
+            'cpuUsageForCredentialGrantsTest.credentialGrantsPerSecPer1vCpu',
+            'cpuUsageForCredentialGrantsTest.meanResponseTime.total',
+            'cpuUsageForCredentialGrantsTest.percentiles1.total',
+            'cpuUsageForCredentialGrantsTest.meanNumberOfRequestsPerSecond.total'
+        ]
+    }
+
+    for test, df in data_frames.items():
+        # Reorder columns to include only the desired ones
+        available_columns = [col for col in columns_to_include[test] if col in df.columns]
+        df = df[available_columns]
+
+        test_date = json_directory.replace("/", "_")
+        # Save to CSV
+        csv_file_path = f"{output_directory}/{test_date}_{test}_results.csv"
+        df.to_csv(csv_file_path, index=False)
+        print(f"Saved {test} results to {csv_file_path}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Process JSON files from a GitHub repository.")
+    parser.add_argument('json_directory', type=str, help='The directory in the GitHub repository containing JSON files.')
+    parser.add_argument('output_directory', type=str, help='The directory to save the CSV files.')
+    args = parser.parse_args()
+
+    github_user = 'keycloak'
+    github_repo = 'keycloak-benchmark'
+    branch_name = 'result-data'
+    json_directory = args.json_directory
+    output_directory = args.output_directory
+
+    # Set up logger
+    log_file = 'perf_insights.log'
+    logger = setup_logger(log_file)
+
+    data_frames = fetch_and_process_json(github_user, github_repo, branch_name, json_directory, logger)
+    save_to_csv(data_frames, json_directory, output_directory)
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/src/main/python/requirements.txt b/benchmark/src/main/python/requirements.txt
index db6d20034..d0bfbe8de 100644
--- a/benchmark/src/main/python/requirements.txt
+++ b/benchmark/src/main/python/requirements.txt
@@ -1,4 +1,6 @@
-playwright==1.37.0
+pandas==2.2.2
+playwright==1.45.1
+Requests==2.32.3
 asyncio==3.4.3
 typing==3.7.4.3
 typing_extensions==4.7.1
diff --git a/doc/kubernetes/modules/ROOT/pages/util/perf-insights.adoc b/doc/kubernetes/modules/ROOT/pages/util/perf-insights.adoc
new file mode 100644
index 000000000..e86b6e45e
--- /dev/null
+++ b/doc/kubernetes/modules/ROOT/pages/util/perf-insights.adoc
@@ -0,0 +1,45 @@
+= Analyzing the Continuous Performance Test results
+:description: A utility to perform custom analysis on the generated results from continuous performance tests.
+
+{description}
+
+== Continuous Performance Tests
+The link:{github-files}/.github/workflows/rosa-cluster-auto-provision-on-schedule.yml[ROSA Daily Scheduled Run workflow] provides automated continuous performance testing and is scheduled to run every weekday.
+
+The workflow starts by deploying a multi-AZ cluster with features such as external Infinispan and persistent sessions enabled. It then executes a series of functional tests to verify the system's performance and stability in an active-active configuration.
+
+Following these tests, a scaling benchmark assesses the system's ability to handle varying loads, providing insights into performance under real-world conditions. The results are then persisted to the `https://github.com/keycloak/keycloak-benchmark/tree/result-data/rosa_scalability[result-data branch]` of the benchmark's GitHub repository for further analysis.
+
+This automated schedule ensures consistent testing, early detection of potential issues, and continuous improvement of {project_name}'s performance.
+
+== Analyze the results
+
+The Python script `link:{github-files}/benchmark/src/main/python/perfInsights.py[perfInsights.py]` analyzes the results generated by the continuous performance tests described above.
+
+=== Installing the required Python libraries
+
+[source,bash]
+----
+pip3 install -U -r requirements.txt
+----
+
+You can check whether all requirements are satisfied with the following command.
+[source,bash]
+----
+python3 -m pip check
+----
+
+=== Usage
+
+Run the following command to fetch the results from the `https://github.com/keycloak/keycloak-benchmark/tree/result-data/rosa_scalability[result-data branch]` and save the analysis to a local directory.
+
+[source, bash]
+----
+python3 perfInsights.py <json_directory> <output_directory>
+----
+
+.Example
+[source, bash]
+----
+python3 perfInsights.py rosa_scalability/2024/07/23 results
+----
diff --git a/doc/kubernetes/modules/ROOT/partials/util-nav.adoc b/doc/kubernetes/modules/ROOT/partials/util-nav.adoc
index c448d32c9..bf5f703ef 100644
--- a/doc/kubernetes/modules/ROOT/partials/util-nav.adoc
+++ b/doc/kubernetes/modules/ROOT/partials/util-nav.adoc
@@ -1,5 +1,6 @@
 ** xref:util/sqlpad.adoc[]
 ** xref:util/grafana.adoc[]
+** xref:util/perf-insights.adoc[]
 ** xref:util/prometheus.adoc[]
 ** xref:util/otel.adoc[]
 ** xref:util/debugging-keycloak.adoc[]
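
For reference, a minimal sketch of the result JSON layout that `perfInsights.py` appears to assume, and of what its `json_normalize` call produces for one test. The field names mirror the script above; the sample dictionary and its values are hypothetical.

[source,python]
----
import pandas as pd
from pandas import json_normalize

# Hypothetical single result file, shaped the way fetch_and_process_json expects it.
file_json = {
    "start": "2024-07-23T10:00:00Z",
    "context": {"externalInfinispanFeatureEnabled": True, "persistentSessionsEnabled": True},
    "cpuUsageForLoginsTest": {
        "name": "keycloak.scenario.authentication.AuthorizationCode",
        "userLoginsPerSecPer1vCpuPerPod": 95.0,
        "statistics": [
            {
                "meanResponseTime": {"total": 120},
                "percentiles1": {"total": 180},
                "meanNumberOfRequestsPerSecond": {"total": 310},
            }
        ],
    },
}

test = "cpuUsageForLoginsTest"
df = json_normalize(
    file_json,
    record_path=[test, "statistics"],   # one row per entry in <test>.statistics
    meta=["start", "context", [test, "userLoginsPerSecPer1vCpuPerPod"]],  # repeated onto every row
    record_prefix=f"{test}.",           # prefix applied to the statistics columns
    errors="ignore",                    # missing meta fields become NaN instead of raising
)

print(df.columns.tolist())
# ['cpuUsageForLoginsTest.meanResponseTime.total',
#  'cpuUsageForLoginsTest.percentiles1.total',
#  'cpuUsageForLoginsTest.meanNumberOfRequestsPerSecond.total',
#  'start', 'context', 'cpuUsageForLoginsTest.userLoginsPerSecPer1vCpuPerPod']
----

These dotted column names are the ones the `columns_to_include` mapping in `save_to_csv` selects before writing the per-test CSV files.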