From 2280d7d31911e5b7370f30e76f702b9dbf89bb43 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Thu, 7 Nov 2024 13:43:29 +0100 Subject: [PATCH] #291: Project improvements after 0.3.0 release * set credentials persistence to `false` in GitHub checkout actions * fixed release notes presence check GitHub workflow * added supported Atum Agent control functions list to documentation * added new grouping of issues into release notes draft * added badges to `README.md` --- .github/workflows/build.yml | 6 +- .github/workflows/check_pr_release_notes.yml | 90 ++++++++++++++++++ .github/workflows/format_check.yml | 3 +- .github/workflows/jacoco_report.yml | 2 + .github/workflows/license_check.yml | 4 +- .../pr_release_note_comment_check.yml | 94 ------------------- .github/workflows/release_draft.yml | 11 ++- .github/workflows/release_publish.yml | 2 + .github/workflows/test_filenames_check.yml | 4 +- README.md | 74 +++++++++++++++ 10 files changed, 190 insertions(+), 100 deletions(-) create mode 100644 .github/workflows/check_pr_release_notes.yml delete mode 100644 .github/workflows/pr_release_note_comment_check.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 31ee0e50b..2340472d6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,7 +30,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - uses: coursier/cache-action@v5 - name: Setup Scala @@ -64,7 +65,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - uses: coursier/cache-action@v5 - name: Setup Scala diff --git a/.github/workflows/check_pr_release_notes.yml b/.github/workflows/check_pr_release_notes.yml new file mode 100644 index 000000000..c4db978cd --- /dev/null +++ b/.github/workflows/check_pr_release_notes.yml @@ -0,0 +1,90 @@ +# +# Copyright 2021 ABSA Group Limited +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Check PR Release Notes in Description + +on: + pull_request: + types: [opened, synchronize, reopened, edited, labeled, unlabeled] + branches: [ master ] + +env: + SKIP_LABEL: 'no RN' + RLS_NOTES_TAG_REGEX: 'Release Notes:' + +jobs: + check-pr-release-notes: + runs-on: ubuntu-latest + + steps: + - name: Get Pull Request Info + id: pr_info + uses: actions/github-script@v7 + with: + script: | + const pr_number = context.payload.pull_request.number; + const pr = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: pr_number + }); + const labels = pr.data.labels ? pr.data.labels.map(label => label.name) : []; + + if (labels.includes("${{ env.SKIP_LABEL }}")) { + console.log("Skipping release notes check because '${{ env.SKIP_LABEL }}' label is present."); + core.setOutput("skip_check", 'true'); + core.setOutput("pr_body", ""); + return; + } + + const pr_body = pr.data.body; + if (!pr_body) { + core.setFailed("Pull request description is empty."); + core.setOutput("pr_body", ""); + core.setOutput("skip_check", 'false'); + return; + } + core.setOutput("pr_body", pr_body); + core.setOutput("skip_check", 'false'); + return; + + - name: Skip check if SKIP_LABEL is present + if: steps.pr_info.outputs.skip_check == 'true' + run: echo "Skipping release notes validation." 
+ + - name: Check for 'Release Notes:' and bullet list + if: steps.pr_info.outputs.skip_check == 'false' + run: | + # Extract the body from the previous step + PR_BODY=$(cat <<-'EOF' + ${{ steps.pr_info.outputs.pr_body }} + EOF + ) + + # Check if "Release Notes:" exists + if ! echo "$PR_BODY" | grep -q '${{ env.RLS_NOTES_TAG_REGEX }}'; then + echo "Error: release notes tag not found in pull request description. Has to adhere to format '${{ env.RLS_NOTES_TAG_REGEX }}'." + exit 1 + fi + + # Extract text after "Release Notes:" line + TEXT_BELOW_RELEASE_NOTES_TAG=$(echo "$PR_BODY" | sed -n '/${{ env.RLS_NOTES_TAG_REGEX }}/,$p' | tail -n +2) + + # Check if there's a bullet list (lines starting with '-', '+' or '*') + if ! echo "$TEXT_BELOW_RELEASE_NOTES_TAG" | grep -qE '^\s*[-+*]\s+.+$'; then + echo "Error: No bullet list found under release notes tag." + exit 1 + fi diff --git a/.github/workflows/format_check.yml b/.github/workflows/format_check.yml index a93ce1783..12090ccfe 100644 --- a/.github/workflows/format_check.yml +++ b/.github/workflows/format_check.yml @@ -27,8 +27,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 ref: ${{ github.event.pull_request.head.ref }} diff --git a/.github/workflows/jacoco_report.yml b/.github/workflows/jacoco_report.yml index 80f08b2f2..0f3157b95 100644 --- a/.github/workflows/jacoco_report.yml +++ b/.github/workflows/jacoco_report.yml @@ -50,6 +50,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + persist-credentials: false - name: Setup Scala uses: olafurpg/setup-scala@v14 with: diff --git a/.github/workflows/license_check.yml b/.github/workflows/license_check.yml index 36a4f4d5f..3113d4886 100644 --- a/.github/workflows/license_check.yml +++ b/.github/workflows/license_check.yml @@ -27,7 +27,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: 
actions/checkout@v4 + with: + persist-credentials: false - name: Setup Scala uses: olafurpg/setup-scala@v10 with: diff --git a/.github/workflows/pr_release_note_comment_check.yml b/.github/workflows/pr_release_note_comment_check.yml deleted file mode 100644 index 4dc08f526..000000000 --- a/.github/workflows/pr_release_note_comment_check.yml +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright 2021 ABSA Group Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -name: PR Release Note Comment Check - -on: - issue_comment: - types: - - created - - edited - - deleted - pull_request: - types: - - opened - - synchronize - - reopened - - edited - - labeled - - unlabeled - branches: [ master ] - -jobs: - check-for-release-notes-comments: - if: ${{ ( github.event_name == 'pull_request') || (github.event.issue.pull_request) }} - name: Check For Release Notes Comments - runs-on: ubuntu-latest - steps: - - name: Get PR branch - uses: xt0rted/pull-request-comment-branch@v1 - id: comment-branch - - - name: Set latest commit status as pending - uses: myrotvorets/set-commit-status-action@master - with: - sha: ${{ steps.comment-branch.outputs.head_sha }} - token: ${{ secrets.GITHUB_TOKEN }} - status: pending - - - name: Fetch all PR comments - if: ${{ ! 
contains( github.event.pull_request.labels.*.name, 'no RN') }} - id: get-comments - uses: actions/github-script@v7 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const issueNumber = context.issue.number; - const repoName = context.repo.repo; - const repoOwner = context.repo.owner; - - const comments = await github.rest.issues.listComments({ - owner: repoOwner, - repo: repoName, - issue_number: issueNumber, - }); - - return comments.data.map(comment => comment.body); - - - name: Check for 'Release Notes' in comments - if: ${{ ! contains( github.event.pull_request.labels.*.name, 'no RN') }} - uses: actions/github-script@v7 - with: - script: | - const comments = ${{ steps.get-comments.outputs.result }}; - console.log("Comments:"); - console.log(comments); - const releaseNotesRegex = /release notes?:?/i; - const hasReleaseNotes = comments.some(comment => releaseNotesRegex.test(comment)); - - if (!hasReleaseNotes) { - console.log('No "Release notes" found in PR comments'); - core.setFailed('No "Release notes" found in PR comments') - } else { - console.log('"Release notes" found in comments'); - } - - name: Set latest commit status as ${{ job.status }} - uses: myrotvorets/set-commit-status-action@master - if: always() - with: - sha: ${{ steps.comment-branch.outputs.head_sha }} - token: ${{ secrets.GITHUB_TOKEN }} - status: ${{ job.status }} diff --git a/.github/workflows/release_draft.yml b/.github/workflows/release_draft.yml index aa303469c..95055bfa1 100644 --- a/.github/workflows/release_draft.yml +++ b/.github/workflows/release_draft.yml @@ -28,6 +28,7 @@ jobs: steps: - uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 # the following step is disabled because it doesn't order the version tags correctly # - name: Validate format of received tag @@ -104,6 +105,7 @@ jobs: steps: - uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 ref: refs/tags/${{ github.event.inputs.tagName }} @@ -119,10 +121,17 @@ 
jobs: with: tag-name: ${{ github.event.inputs.tagName }} chapters: '[ + {"title": "No entry 🚫", "label": "duplicate"}, + {"title": "No entry 🚫", "label": "invalid"}, + {"title": "No entry 🚫", "label": "wontfix"}, + {"title": "No entry 🚫", "label": "no RN"}, {"title": "Breaking Changes 💥", "label": "breaking-change"}, {"title": "New Features 🎉", "label": "enhancement"}, {"title": "New Features 🎉", "label": "feature"}, - {"title": "Bugfixes 🛠", "label": "bug"} + {"title": "Bugfixes 🛠", "label": "bug"}, + {"title": "Infrastructure ⚙️", "label": "infrastructure"}, + {"title": "Silent-live 🤫", "label": "silent-live"}, + {"title": "Documentation 📜", "label": "documentation"} ]' duplicity-scope: 'service' duplicity-icon: '🔁' diff --git a/.github/workflows/release_publish.yml b/.github/workflows/release_publish.yml index b349a8ff6..3a68d8cf2 100644 --- a/.github/workflows/release_publish.yml +++ b/.github/workflows/release_publish.yml @@ -27,6 +27,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 - uses: coursier/cache-action@v5 @@ -51,6 +52,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: + persist-credentials: false fetch-depth: 0 - uses: coursier/cache-action@v5 diff --git a/.github/workflows/test_filenames_check.yml b/.github/workflows/test_filenames_check.yml index d3e24ee2f..6e35228e2 100644 --- a/.github/workflows/test_filenames_check.yml +++ b/.github/workflows/test_filenames_check.yml @@ -27,7 +27,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 + with: + persist-credentials: false - name: Filename Inspector id: scan-test-files diff --git a/README.md b/README.md index 46dfc975b..2b1185990 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,19 @@ # Atum Service +[![Build](https://github.com/AbsaOSS/atum-service/actions/workflows/build.yml/badge.svg)](https://github.com/AbsaOSS/atum-service/actions/workflows/build.yml)
+[![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) +[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/AbsaOSS/atum-service/graphs/commit-activity) + +| Atum Server | Atum Agent | Atum Model | Atum Reader | +|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [![GitHub release](https://img.shields.io/github/release/AbsaOSS/atum-service.svg)](https://GitHub.com/AbsaOSS/atum-service/releases/) | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa.atum-service/atum-agent-spark3_2.13/badge.svg)](https://central.sonatype.com/search?q=atum-agent&namespace=za.co.absa.atum-service) | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa.atum-service/atum-model_2.13/badge.svg)](https://central.sonatype.com/search?q=atum-model&namespace=za.co.absa.atum-service) | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa.atum-service/atum-reader_2.13/badge.svg)](https://central.sonatype.com/search?q=atum-reader&namespace=za.co.absa.atum-service) | + + + + - [Atum Service](#atum-service) + - [Motivation](#motivation) + - [Features](#features) - [Modules](#modules) - [Agent `agent/`](#agent-agent) - [Reader `reader/`](#agent-agent) @@ -15,6 +28,9 @@ - [Measurement](#measurement) - [Checkpoint](#checkpoint) - [Data
Flow](#data-flow) + - [Usage](#usage) + - [Atum Agent routines](#atum-agent-routines) + - [Control measurement types](#control-measurement-types) - [How to generate Code coverage report](#how-to-generate-code-coverage-report) - [How to Run in IntelliJ](#how-to-run-in-intellij) - [How to Run Tests](#how-to-run-tests) @@ -41,6 +57,39 @@ functions and are stored on a single central place, in a relational database. Co checkpoints is not only helpful for complying with strict regulatory frameworks, but also helps during development and debugging of your Spark-based data processing. +## Motivation + +Big Data strategy for a company usually includes data gathering and ingestion processes. +That is the definition of how data from different systems operating inside a company +are gathered and stored for further analysis and reporting. An ingestion process can involve +various transformations like: +* Converting between data formats (XML, CSV, etc.) +* Data type casting, for example converting XML strings to numeric values +* Joining reference tables. For example this can include enriching existing + data with additional information available through dictionary mappings. + This constitutes a common ETL (Extract, Transform and Load) process. + +During such transformations, sometimes data can get corrupted (e.g. during casting), records can +get added or lost. For instance, *outer joining* a table holding duplicate keys can result in an explosion of records. +And *inner joining* a table which has no matching keys for some records will result in loss of records. + +In regulated industries it is crucial to ensure data integrity and accuracy. For instance, in the banking industry +the BCBS set of regulations requires analysis and reporting to be based on data accuracy and integrity principles. +Thus it is critical at the ingestion stage to preserve the accuracy and integrity of the data gathered from a +source system.
+ +The purpose of Atum is to provide means of ensuring no critical fields have been modified during +the processing and no records are added or lost. To do this the library provides an ability +to calculate *hash sums* of explicitly specified columns. We call the set of hash sums at a given time +a *checkpoint* and each hash sum we call a *control measurement*. Checkpoints can be calculated anytime +between Spark transformations and actions. + +We assume the data for ETL are processed in a series of batch jobs. Let's call each data set for a given batch +job a *batch*. All checkpoints are calculated for a specific batch. + +## Features + +TBD ## Modules @@ -157,6 +206,31 @@ The journey of a dataset throughout various data transformations and pipelines. even if it involves multiple applications or ETL pipelines. +## Usage + +### Atum Agent routines + +TBD + +### Control measurement types + +The control measurement of a column is a hash sum. It can be calculated differently depending on the column's data type and +on business requirements. 
This table represents all currently supported measurement types: + +| Type | Description | +|------------------------------------|:--------------------------------------------------------------| +| AtumMeasure.RecordCount | Calculates the number of rows in the dataset | +| AtumMeasure.DistinctRecordCount | Calculates COUNT(DISTINCT()) of the specified column | +| AtumMeasure.SumOfValuesOfColumn | Calculates SUM() of the specified column | +| AtumMeasure.AbsSumOfValuesOfColumn | Calculates SUM(ABS()) of the specified column | +| AtumMeasure.SumOfHashesOfColumn | Calculates SUM(CRC32()) of the specified column | +| Measure.UnknownMeasure | Custom measure where the data are provided by the application | + +[//]: # (| controlType.aggregatedTruncTotal | Calculates SUM(TRUNC()) of the specified column |) + +[//]: # (| controlType.absAggregatedTruncTotal | Calculates SUM(TRUNC(ABS())) of the specified column |) + + ## How to generate Code coverage report ```sbt sbt jacoco