-
Notifications
You must be signed in to change notification settings - Fork 1
156 lines (127 loc) · 5.25 KB
/
preprocess.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
name: Preprocessing

# Runs the preprocessing job for pull requests that touch dataset folders.
#
# SECURITY: `pull_request_target` executes in the context of the base
# repository (its GITHUB_TOKEN and secrets are available), not the fork.
# Every value derived from the pull request (file names, branch names,
# checked-out code) must therefore be treated as untrusted input.
on:
  pull_request_target:
    # (re)opened PR or new commit in fork
    types: [opened, synchronize, reopened]
    # only PRs that modify files below these folders trigger the workflow
    paths:
      - 'raw_data/**'
      - 'processed_data/**'
jobs:
  # Single job: preprocess the datasets touched by the pull request, validate
  # them, push the results back to the PR branch and report/label the PR.
  preprocess:
    name: Preprocess raw data
    # NOTE: on windows as computing of descriptors has a bug on linux right now
    # NOTE(review): GitHub has retired the hosted `windows-2019` image - confirm
    # and move to a currently supported Windows image.
    runs-on: windows-2019
    # Least privilege for the job token: pushing (incl. LFS objects) needs
    # `contents`, commenting and labelling the PR need `issues`/`pull-requests`.
    # NOTE(review): verify that no step relies on an additional scope.
    permissions:
      contents: write
      issues: write
      pull-requests: write
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}  # needed for pulling R packages from github
    steps:
      # SECURITY: this checks out the *fork's* head commit; every script executed
      # by the following steps (scripts/R_ci, scripts/Python) therefore comes from
      # the pull request and runs with the job token of the base repository.
      - name: Checkout fork repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.head_ref }}
          lfs: true

      # NOTE(review): the version tag was unreadable in the reviewed copy of this
      # file (mangled to an e-mail placeholder); `v2.2.0` is a reconstruction -
      # confirm against the repository history.
      - name: Get changed files
        id: files
        uses: Ana06/[email protected]

      # Reduce the changed files to the names of their parent directories and keep
      # only purely numeric names (the `grep`), de-duplicated and space separated.
      # The file list is attacker-controlled, so it is handed over through an
      # environment variable instead of being expanded into the script text.
      # `bash {0}` (no `-e`/`pipefail`): an empty grep result must not fail the step.
      - name: Get new/changed datasets
        id: filesfolders
        shell: bash {0}
        env:
          ALL_CHANGED_FILES: ${{ steps.files.outputs.all }}
        run: |
          datasets=$(for f in $ALL_CHANGED_FILES; do basename "$(dirname "$f")"; done | grep -E '^[0-9]+$' | sort | uniq | tr '\n' ' ')
          echo "files=$datasets" >> "$GITHUB_OUTPUT"

      # `steps.filesfolders.outputs.files` contains digits and spaces only (see the
      # grep filter above), so expanding it directly into commands below is safe.
      - name: List all added files
        shell: bash {0}
        run: |
          for f in ${{ steps.filesfolders.outputs.files }}; do
            ls -lh "raw_data/$f"
          done

      - name: Python dependencies
        run: pip install -r scripts/Python/requirements.txt

      - name: Setup java
        uses: actions/setup-java@v3
        with:
          distribution: 'temurin'
          java-version: '17'

      # Exported via GITHUB_ENV so that the cache step below can read it as
      # `env.RENV_PATHS_ROOT`.
      - name: Set RENV_PATHS_ROOT
        shell: bash
        run: |
          echo "RENV_PATHS_ROOT=${{ runner.temp }}/renv" >> "$GITHUB_ENV"

      - name: Setup R
        uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true
          r-version: '4.2.2'

      # Cache key follows the renv lockfile(s); falls back to the newest cache of
      # the same OS when the lockfile changed.
      - name: Restore Renv package cache
        uses: actions/cache@v3
        with:
          path: ${{ env.RENV_PATHS_ROOT }}
          key: ${{ runner.os }}-renv-${{ hashFiles('**/renv.lock') }}
          restore-keys: |
            ${{ runner.os }}-renv-

      - name: Activate renv
        shell: Rscript {0}
        run: |
          options(renv.config.mran.enabled = FALSE)
          renv::restore()

      # --- Preprocessing: each script receives the dataset ids as arguments ---
      - name: Standardize compounds
        run: Rscript scripts/R_ci/compounds_standardize.R ${{ steps.filesfolders.outputs.files }}

      - name: Compounds classyfire classes
        run: Rscript scripts/R_ci/compounds_classyfire.R ${{ steps.filesfolders.outputs.files }}

      - name: Compounds descriptors
        run: Rscript scripts/R_ci/compounds_descriptors.R ${{ steps.filesfolders.outputs.files }}

      - name: Compounds fingerprints
        run: Rscript scripts/R_ci/compounds_fingerprints.R ${{ steps.filesfolders.outputs.files }}

      - name: Metadata standardization
        run: Rscript scripts/R_ci/metadata_standardize.R ${{ steps.filesfolders.outputs.files }}

      - name: Generate dataset reports
        run: Rscript scripts/R_ci/compounds_overview.R ${{ steps.filesfolders.outputs.files }}

      - name: Verify that required files are present
        run: Rscript scripts/R_ci/files_complete.R ${{ steps.filesfolders.outputs.files }}

      # --- Overview and validation: best effort, failures do not fail the job ---
      - name: Update overview table of all datasets
        run: python3 scripts/Python/datasets_overview.py
        continue-on-error: true

      - name: QSPR-based validation
        run: python3 scripts/Python/validation_qspr.py ${{ steps.filesfolders.outputs.files }}
        continue-on-error: true

      - name: Retention order-based validation for datasets with nominally identical setups
        run: python3 scripts/Python/validation_order.py --mode same_condition ${{ steps.filesfolders.outputs.files }}
        continue-on-error: true

      - name: Retention order-based validation for datasets of systematic measurements
        run: python3 scripts/Python/validation_order.py --mode systematic ${{ steps.filesfolders.outputs.files }}
        continue-on-error: true

      # Commits the generated files and pushes them to the checked-out PR branch.
      # NOTE(review): the committer e-mail was unreadable in the reviewed copy of
      # this file (mangled to a placeholder); the address below is a
      # reconstruction - confirm.
      # NOTE(review): no `shell` is set, so this runs in the runner's default shell
      # (PowerShell on Windows runners), where a failing `git` command in the
      # middle of the script does not abort it - confirm this is intended.
      - name: Commit preprocessing
        run: |
          git config --global user.email '[email protected]'
          git config --global user.name 'Github Actions'
          # Use LFS storage of main repository: no push access to fork LFS storage
          # TODO: change once repository is moved
          git config lfs.url 'https://github.com/f-kretschmer/pr_test_repo.git/info/lfs'
          git add processed_data raw_data
          git commit -m "Preprocessing ${{ steps.filesfolders.outputs.files }}"
          git lfs push origin HEAD # first push LFS, otherwise failure because of lfs.url
          git push origin HEAD

      # Posts the stdout of scripts/Python/report.py as a PR comment. Both calls
      # are awaited so that a failing API request surfaces as a step failure
      # (tolerated via continue-on-error) instead of an unhandled rejection.
      - name: Add comment with report to PR
        uses: actions/github-script@v6
        with:
          script: |
            const datasets = '${{ steps.filesfolders.outputs.files }}'.trim().split(' ')
            const report = await exec.getExecOutput('python3 scripts/Python/report.py', datasets)
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: report.stdout
            })
        continue-on-error: true

      # NOTE(review): `@master` is a mutable ref of a third-party action that runs
      # with a write-capable token - consider pinning to a release tag or commit.
      - name: Label as successfully preprocessed
        if: ${{ success() }}
        uses: andymckay/labeler@master
        with:
          add-labels: "preprocessing successful"
          remove-labels: "preprocessing failed"

      # Runs before the debug session so the label is set without waiting for the
      # session to end.
      - name: Label as failed
        if: ${{ failure() }}
        uses: andymckay/labeler@master
        with:
          add-labels: "preprocessing failed"
          remove-labels: "preprocessing successful"

      # Interactive debugging session on the runner. Access is restricted to the
      # SSH keys of the user who triggered the run (otherwise anybody able to read
      # the job log could connect), and the session is bounded in time.
      # NOTE(review): for pull_request_target runs the actor is the PR author, so
      # maintainers cannot attach - adjust if maintainer access is required.
      - name: Debug with tmate on failure
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3
        timeout-minutes: 30
        with:
          limit-access-to-actor: true