diff --git a/scripts/finished_scripts/confusionMatrix.daph b/scripts/finished_scripts/confusionMatrix.daph new file mode 100644 index 000000000..6268e3574 --- /dev/null +++ b/scripts/finished_scripts/confusionMatrix.daph @@ -0,0 +1,68 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Computes the confusion matrix for input vectors of predictions +# and actual labels. We return both the counts and relative frequency +# (normalized by sum of true labels) +# +# .. 
code-block:: +# +# True Labels +# 1 2 +# 1 TP | FP +# Predictions ----+---- +# 2 FN | TN +# +# INPUT: +# ------------------------------------------------------------------------------ +# P vector of predictions (1-based, recoded) +# Y vector of actual labels (1-based, recoded) +# ------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------ +# confusionSum the confusion matrix as absolute counts +# confusionAvg the confusion matrix as relative frequencies +# ------------------------------------------------------------------------------ + +def m_confusionMatrix(P:matrix, Y:matrix) -> matrix, matrix { + dim = max(aggMax(Y), aggMax(P)); + + if (ncol(P) > 1 || ncol(Y) > 1) { + stop("confusionMatrix: Invalid input number of cols should be 1 in both P [" + as.si64(ncol(P)) + "] and Y [" + as.si64(ncol(Y)) + "]"); + } + + + if (nrow(P) != nrow(Y)) { + stop("confusionMatrix: The number of rows have to be equal in both P [" + as.si64(nrow(P)) + "] and Y [" + as.si64(nrow(Y)) + "]"); + } + + + if (aggMin(P) < 1 || aggMin(Y) < 1) { + stop("confusionMatrix: All Values in P and Y should be above or equal to 1, min(P):" + aggMin(P) + " min(Y):" + aggMin(Y)); + } + + confusionSum = ctable(P, Y, dim, dim); + # max to avoid division by 0, in case a column contains no entries. + confusionAvg = confusionSum / max(1, sum(confusionSum, 1)); + return as.matrix(confusionSum), as.matrix(confusionAvg); +} + diff --git a/scripts/finished_scripts/dist.daph b/scripts/finished_scripts/dist.daph new file mode 100644 index 000000000..b8f1a4bb7 --- /dev/null +++ b/scripts/finished_scripts/dist.daph @@ -0,0 +1,41 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Returns Euclidean distance matrix (distances between N n-dimensional points) +# +# INPUT: +# -------------------------------------------------------------------------------- +# X Matrix to calculate the distance inside +# -------------------------------------------------------------------------------- +# +# OUTPUT: +# ----------------------------------------------------------------------------------------------- +# Y Euclidean distance matrix +# ----------------------------------------------------------------------------------------------- + +def m_dist(X:matrix) -> matrix { + n = as.si64(nrow(X)); + s = sum(X ^ 2, 0); + Y = sqrt((-2.0) * X @ t(X) + s + t(s)); + Y = replace(Y, nan, 0); + return as.matrix(Y); +} + diff --git a/scripts/finished_scripts/getAccuracy.daph b/scripts/finished_scripts/getAccuracy.daph new file mode 100644 index 000000000..bf63db23e --- /dev/null +++ b/scripts/finished_scripts/getAccuracy.daph @@ -0,0 +1,55 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This builtin function compute the weighted and simple accuracy for given predictions +# +# INPUT: +# -------------------------------------------------------------------------------------- +# y Ground truth (Actual Labels) +# yhat Predictions (Predicted labels) +# isWeighted Flag for weighted or non-weighted accuracy calculation +# -------------------------------------------------------------------------------------- +# +# OUTPUT: +# -------------------------------------------------------------------------------------------- +# accuracy accuracy of the predicted labels +# -------------------------------------------------------------------------------------------- + +def m_getAccuracy(y:matrix, yhat:matrix, isWeighted:bool /*= false*/) -> f64 { + accuracy = 0.0; + if ((isWeighted == false)) { + sum = sum(y == yhat); + accuracy = (sum / as.si64(nrow(y))) * 100; + } else { + n = as.si64(nrow(y)); + classes = ctable(y, 1, aggMax(y), 1); + resp = fill(as.f64(0), as.si64(nrow(y)), as.si64(nrow(classes))); + resp = as.matrix(resp + t(seq(as.f64(1), as.si64(nrow(classes)), 1 <= as.si64(nrow(classes)) ? 
1 : -1))); + respY = resp == y; + respYhat = resp == yhat; + pred = respY * respYhat; + classes = replace(classes, 0, 1); + accuracy = as.f64(mean(sum(pred, 1) / t(classes)) * 100); + } + + return as.f64(accuracy); +} + diff --git a/scripts/finished_scripts/normalize.daph b/scripts/finished_scripts/normalize.daph new file mode 100644 index 000000000..ce2efc483 --- /dev/null +++ b/scripts/finished_scripts/normalize.daph @@ -0,0 +1,48 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Min-max normalization (a.k.a. min-max scaling) to range [0,1]. For matrices +# of positive values, this normalization preserves the input sparsity. 
+# +# INPUT: +# --------------------------------------------------------------------------------------- +# X Input feature matrix of shape n-by-m +# --------------------------------------------------------------------------------------- +# +# OUTPUT: +# --------------------------------------------------------------------------------------- +# Y Modified output feature matrix of shape n-by-m +# cmin Column minima of shape 1-by-m +# cmax Column maxima of shape 1-by-m +# --------------------------------------------------------------------------------------- + +import "normalizeApply.daph"; + +def m_normalize(X:matrix) -> matrix, matrix, matrix { + # compute feature ranges for transformations + cmin = aggMin(X, 1); + cmax = aggMax(X, 1); + + # normalize features to range [0,1] + Y = normalizeApply.m_normalizeApply(X, cmin, cmax); + + return Y, cmin, cmax; +} diff --git a/scripts/finished_scripts/normalizeApply.daph b/scripts/finished_scripts/normalizeApply.daph new file mode 100644 index 000000000..7b3ae41e9 --- /dev/null +++ b/scripts/finished_scripts/normalizeApply.daph @@ -0,0 +1,47 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Min-max normalization (a.k.a. min-max scaling) to range [0,1], given +# existing min-max ranges. For matrices of positive values, this normalization +# preserves the input sparsity. The validity of the provided min-max range +# and post-processing is under control of the caller. +# +# INPUT: +# ------------------------------------------------ +# X Input feature matrix of shape n-by-m +# cmin Column min of shape 1-by-m +# cmax Column max of shape 1-by-m +# ------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------ +# Y Modified output feature matrix of shape n-by-m +# ------------------------------------------------ + +def m_normalizeApply(X:matrix, cmin:matrix, cmax:matrix) -> matrix { + diff = (cmax - cmin); + # avoid division by zero and divide by 1 instead + diff = replace(diff, 0, 1); + # normalize features to given range ([0,1] if indeed min/max) + Y = (X - cmin) / diff; + return as.matrix(Y); +} + diff --git a/scripts/finished_scripts/raSelection.daph b/scripts/finished_scripts/raSelection.daph new file mode 100644 index 000000000..494d16e6a --- /dev/null +++ b/scripts/finished_scripts/raSelection.daph @@ -0,0 +1,6 @@ +def m_raSelection(X:matrix, col:si64, op:str, val:f64) -> matrix { + I = op == "==" ? X[, col - 1] == val : op == "!=" ? X[, col - 1] != val : op == "<" ? X[, col - 1] < val : op == ">" ? X[, col - 1] > val : op == "<=" ? X[, col - 1] <= val : X[, col - 1] >= val; + Y = X[[I, ]]; + return as.matrix(Y); +} + diff --git a/scripts/finished_scripts/scale.daph b/scripts/finished_scripts/scale.daph new file mode 100644 index 000000000..e6a3955af --- /dev/null +++ b/scripts/finished_scripts/scale.daph @@ -0,0 +1,74 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This function scales and center individual features in the input +# matrix (column wise.) using z-score to scale the values. +# The transformation is sometimes also called scale and shift, +# but it is shifted first and then subsequently scaled. +# +# The method is not resistant to inputs containing NaN nor overflows +# of doubles, but handle it by guaranteeing that no extra NaN values +# are introduced and columns that contain NaN will not be scaled or shifted. 
+# +# INPUT: +# -------------------------------------------------------------------------------------- +# X Input feature matrix +# center Indicates to center the feature matrix +# scale Indicates to scale the feature matrix according to z-score +# -------------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------------- +# Out Output feature matrix scaled and shifted +# Centering The column means of the input, subtracted if Center was TRUE +# ScaleFactor The scaling of the values, to make each dimension have similar value ranges +# ------------------------------------------------------------------------------------------- + +def m_scale(X:matrix, center:bool /*= true*/, scale:bool /*= true*/) -> matrix, matrix, matrix { + # Allocate the Centering and ScaleFactor as empty matrices, + # to return something on the function call. + Centering = fill(as.f64(0), 1, 1); + ScaleFactor = fill(as.f64(0), 1, 1); + + if (center) { + Centering = mean(X, 1); + # Replace entries with Nan with 0 to avoid introducing more NaN values. + Centering = replace(Centering, nan, 0); + X = as.matrix(X - Centering); + } + + + if (scale) { + N = as.si64(nrow(X)); + ScaleFactor = sqrt(sum(X ^ 2, 1) / (N - 1)); + + # Replace entries in the scale factor that are 0 and NaN with 1. + # To avoid division by 0 or NaN, introducing NaN to the ouput. + ScaleFactor = replace(ScaleFactor, nan, 1); + ScaleFactor = replace(ScaleFactor, 0, 1); + X = as.matrix(X / ScaleFactor); + } + + # assign output to the returned value. 
+ Out = X; + return as.matrix(Out), as.matrix(Centering), as.matrix(ScaleFactor); +} + diff --git a/scripts/finished_scripts/scaleApply.daph b/scripts/finished_scripts/scaleApply.daph new file mode 100644 index 000000000..78d518100 --- /dev/null +++ b/scripts/finished_scripts/scaleApply.daph @@ -0,0 +1,51 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This function scales and center individual features in the input matrix (column wise.) using the input matrices. 
+# +# INPUT: +# ------------------------------------------------------------------------------------------ +# X Input feature matrix +# Centering The column means to subtract from X (not done if empty) +# ScaleFactor The column scaling to multiply with X (not done if empty) +# ------------------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------------ +# Y Output feature matrix with K columns +# ------------------------------------------------------------------------------------ + +def m_scaleApply(X:matrix, Centering:matrix, ScaleFactor:matrix) -> matrix { + Y = [0.0]; + if (as.si64(nrow(Centering)) > 0 && as.si64(ncol(Centering)) > 0) { + Y = X - Centering; + } else { + Y = as.matrix(X); + } + + + if (as.si64(nrow(ScaleFactor)) > 0 && as.si64(ncol(ScaleFactor)) > 0) { + Y = as.matrix(Y / ScaleFactor); + } + + return as.matrix(Y); +} + diff --git a/scripts/finished_scripts/scaleMinMax.daph b/scripts/finished_scripts/scaleMinMax.daph new file mode 100644 index 000000000..3c5078b78 --- /dev/null +++ b/scripts/finished_scripts/scaleMinMax.daph @@ -0,0 +1,10 @@ +def m_scaleMinMax(X:matrix) -> matrix { + print("Deprecated scaleMinMax use normalize instead"); + cmin = aggMin(X, 1); + cmax = aggMax(X, 1); + diff = (cmax - cmin); + diff = replace(diff, 0, 1); + Y = (X - cmin) / diff; + return as.matrix(Y); +} + diff --git a/scripts/finished_scripts/shortestPath.daph b/scripts/finished_scripts/shortestPath.daph new file mode 100644 index 000000000..1814b8d30 --- /dev/null +++ b/scripts/finished_scripts/shortestPath.daph @@ -0,0 +1,99 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Computes the minimum distances (shortest-path) between a single source vertex and every other vertex in the graph. +# +# Grzegorz Malewicz, Matthew H. Austern, Aart J. C. Bik, +# James C. Dehnert, Ilan Horn, Naty Leiser and Grzegorz Czajkowski: +# Pregel: A System for Large-Scale Graph Processing, SIGMOD 2010 +# +# INPUT: +# ------------------------------------------------------------------------------------------- +# G adjacency matrix of the labeled graph: Such graph can be undirected +# (G is symmetric) or directed (G is not symmetric). +# The values of G can be 0/1 (just specifying whether the nodes +# are connected or not) or integer values (representing the weight +# of the edges or the distances between nodes, 0 if not connected). +# maxi Integer max number of iterations accepted (0 for FALSE, i.e. +# max number of iterations not defined) +# sourceNode node index to calculate the shortest paths to all other nodes. 
+# verbose flag for verbose debug output +# ------------------------------------------------------------------------------------------- +# +# OUTPUT: +# -------------------------------------------------------------------------------------- +# C Output matrix (double) of minimum distances (shortest-path) between +# vertices: The value of the ith row and the jth column of the output +# matrix is the minimum distance shortest-path from vertex i to vertex j. +# When the value of the minimum distance is infinity, the two nodes are +# not connected. +# -------------------------------------------------------------------------------------- + +def m_shortestPath(G:matrix, maxi:si64 /*= 0*/, sourceNode:si64, verbose:bool /*= false*/) -> matrix { + + if (verbose) { + print("SHORTEST PATH CALCULATION:"); + } + + + if (aggMin(G) < 0) { + stop("Shortest Path: All values in G must be positive."); + } + + + if (as.si64(ncol(G)) != as.si64(nrow(G))) { + stop("Shortest Path: input graph G must be a squared matrix."); + } + + # initialize the matrix of minimum distances with "infinity" values: + minDist = fill(as.f64(inf), as.si64(nrow(G)), 1); + + # update minimum distance from the sourceNode to itself to 0: + minDist[sourceNode - 1, 0] = as.matrix(0.0); + iter = 1; + diff = inf; + while (diff > 0 && (maxi == 0 || iter <= maxi)) { + # avoid densification of 'colMins(G + minDist)' via colMin-colMax transform + # (we exploit here that ^x with x!=0 is treated as sparse-safe and otherwise + # Inf or NaN and replaced with 0, which keeps large sparse graphs sparse) + Gp = (G + (G != 0) * minDist) ^ ((-1.0)); + Gp = replace(Gp, inf, 0); + Gp = replace(Gp, nan, 0); + u = min(t(1 / aggMax(Gp, 1)), minDist); + diff = sum(minDist != u); + minDist = as.matrix(u); + + if (verbose) { + print("Shortest Path:"); + print("iter = " + iter + ", diff = " + diff); + } + + iter = as.si64(iter + 1); + } + C = minDist; + + if (verbose) { + print("\nShortest Path:"); print(C); + } + + return as.matrix(C); 
+} + diff --git a/scripts/finished_scripts/sigmoid.daph b/scripts/finished_scripts/sigmoid.daph new file mode 100644 index 000000000..a1111b695 --- /dev/null +++ b/scripts/finished_scripts/sigmoid.daph @@ -0,0 +1,44 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# The Sigmoid function is a type of activation function, and also defined as a squashing function which limit the +# output to a range between 0 and 1, which will make these functions useful in the prediction of probabilities. +# +# INPUT: +# ------------------------------------------------------------------------------------------- +# X Matrix of feature vectors. +# ------------------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------------ +# Y 1-column matrix of weights. 
+# ------------------------------------------------------------------------------------------ + +def m_sigmoid(X:matrix) -> matrix { + Y = 1 / (1 + exp((0.0 - X))); + return as.matrix(Y); +} + +def s_sigmoid(x:f64) -> f64 { + y = 1 / (1 + exp((0.0 - x))); + return as.f64(y); +} + diff --git a/scripts/finished_scripts/test_confusionMatrix.daph b/scripts/finished_scripts/test_confusionMatrix.daph new file mode 100644 index 000000000..745d0fd94 --- /dev/null +++ b/scripts/finished_scripts/test_confusionMatrix.daph @@ -0,0 +1,9 @@ +import "confusionMatrix.daph"; + +p = [1.0, 2.0, 3.0, 4.0]; +y = [1.0, 2.0, 4.0, 4.0]; + +sum, avg = confusionMatrix.m_confusionMatrix(p, y); + +print(sum); +print(avg); diff --git a/scripts/finished_scripts/test_dist.daph b/scripts/finished_scripts/test_dist.daph new file mode 100644 index 000000000..aba96954e --- /dev/null +++ b/scripts/finished_scripts/test_dist.daph @@ -0,0 +1,5 @@ +import "dist.daph"; + +X = [1.0, 2.0, 3.0, 4.0]; + +print(dist.m_dist(X)); diff --git a/scripts/finished_scripts/test_getAccuracy.daph b/scripts/finished_scripts/test_getAccuracy.daph new file mode 100644 index 000000000..8cdc4609f --- /dev/null +++ b/scripts/finished_scripts/test_getAccuracy.daph @@ -0,0 +1,6 @@ +import "getAccuracy.daph"; + +y = [1,7,3,9,4,8,2,6,9,2,4,7,1,2,5]; +y_hat = [1,4,8,2,4,1,6,7,0,4,7,2,3,7,5]; + +print(getAccuracy.m_getAccuracy(as.f64(y), as.f64(y_hat), as.bool(0))); diff --git a/scripts/finished_scripts/test_idx.daph b/scripts/finished_scripts/test_idx.daph new file mode 100644 index 000000000..8bc9bfea6 --- /dev/null +++ b/scripts/finished_scripts/test_idx.daph @@ -0,0 +1,8 @@ +import "idx.daph"; + +X = [1, 2, 3, 0, 2, -1](2, 3); +print(X); + +Y = idx.m_idx(as.f64(X)); + +print(Y); diff --git a/scripts/finished_scripts/test_normalize.daph b/scripts/finished_scripts/test_normalize.daph new file mode 100644 index 000000000..0e5254483 --- /dev/null +++ b/scripts/finished_scripts/test_normalize.daph @@ -0,0 +1,9 @@ +import 
"normalize.daph"; + +X = [0.0, 1.0, 2.0, 3.0]; + +Y, cmin, cmax = normalize.m_normalize(X); + +print(Y); +print(cmin); +print(cmax); diff --git a/scripts/finished_scripts/test_normalizeApply.daph b/scripts/finished_scripts/test_normalizeApply.daph new file mode 100644 index 000000000..4f27e64b2 --- /dev/null +++ b/scripts/finished_scripts/test_normalizeApply.daph @@ -0,0 +1,10 @@ +import "normalizeApply.daph"; + +X = [0.0, 1.0, 2.0, 3.0]; + +cmin = [0]; +cmax = [3]; + +Y = normalizeApply.m_normalizeApply(as.f64(X), as.f64(cmin), as.f64(cmax)); + +print(Y); diff --git a/scripts/finished_scripts/test_raSelection.daph b/scripts/finished_scripts/test_raSelection.daph new file mode 100644 index 000000000..73f9e9c81 --- /dev/null +++ b/scripts/finished_scripts/test_raSelection.daph @@ -0,0 +1,11 @@ +import "raSelection.daph"; + +X = [1, 0, 0, 1, 0, 0, 1, 0, 0]; +print(X); + +col = 0; +op = "=="; +val = 0.0; + +Y = raSelection.m_raSelection(as.f64(X), col, op, val); +print(Y); diff --git a/scripts/finished_scripts/test_scale.daph b/scripts/finished_scripts/test_scale.daph new file mode 100644 index 000000000..0605c05d7 --- /dev/null +++ b/scripts/finished_scripts/test_scale.daph @@ -0,0 +1,12 @@ +import "scale.daph"; + +X = [1, 1, 1, 2, 2, 2, 3, 3, 3](3, 3); + +center = true; +scale = true; + +out, centering, scaleFactor = scale.m_scale(as.f64(X), center, scale); + +print(out); +print(centering); +print(scaleFactor); diff --git a/scripts/finished_scripts/test_scaleApply.daph b/scripts/finished_scripts/test_scaleApply.daph new file mode 100644 index 000000000..ce26f6970 --- /dev/null +++ b/scripts/finished_scripts/test_scaleApply.daph @@ -0,0 +1,9 @@ +import "scaleApply.daph"; + +X = [1, 1, 1, 2, 2, 2, 3, 3, 3](3, 3); + +center = [2, 2, 2]; +scale = [1, 1, 1]; + +Y = scaleApply.m_scaleApply(as.f64(X), as.f64(center), as.f64(scale)); +print(Y); diff --git a/scripts/finished_scripts/test_scaleMinMax.daph b/scripts/finished_scripts/test_scaleMinMax.daph new file mode 100644 
index 000000000..55cba5c1f --- /dev/null +++ b/scripts/finished_scripts/test_scaleMinMax.daph @@ -0,0 +1,7 @@ +import "scaleMinMax.daph"; + +X = [1, 1, 1, 2, 2, 2, 3, 3, 3](3, 3); + +X = scaleMinMax.m_scaleMinMax(as.f64(X)); + +print(X); diff --git a/scripts/finished_scripts/test_shortestPath.daph b/scripts/finished_scripts/test_shortestPath.daph new file mode 100644 index 000000000..74a0135cc --- /dev/null +++ b/scripts/finished_scripts/test_shortestPath.daph @@ -0,0 +1,14 @@ +import "shortestPath.daph"; + +G = [0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0](4, 4); + +sourceNode = 1; +maxi = 0; +verbose = true; + +result = shortestPath.m_shortestPath(G, maxi, sourceNode, verbose); + +print(result); diff --git a/scripts/finished_scripts/test_sigmoid.daph b/scripts/finished_scripts/test_sigmoid.daph new file mode 100644 index 000000000..e81826919 --- /dev/null +++ b/scripts/finished_scripts/test_sigmoid.daph @@ -0,0 +1,7 @@ +import "sigmoid.daph"; + +X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + +Y = sigmoid.m_sigmoid(as.f64(X)); + +print(Y); diff --git a/scripts/finished_scripts/test_toOneHot.daph b/scripts/finished_scripts/test_toOneHot.daph new file mode 100644 index 000000000..a098dff1a --- /dev/null +++ b/scripts/finished_scripts/test_toOneHot.daph @@ -0,0 +1,9 @@ +import "toOneHot.daph"; + +X = [1, 2, 3, 4, 5]; + +numclasses = 5; + +Y = toOneHot.m_toOneHot(as.f64(X), numclasses); + +print(Y); diff --git a/scripts/finished_scripts/toOneHot.daph b/scripts/finished_scripts/toOneHot.daph new file mode 100644 index 000000000..22cca2aaf --- /dev/null +++ b/scripts/finished_scripts/toOneHot.daph @@ -0,0 +1,44 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# The toOneHot-function encodes an unordered categorical vector to multiple binary vectors. +# +# INPUT: +# ------------------------------------------------------------------------------------------ +# X Vector with N integer entries between 1 and numClasses +# numclasses Number of columns, must be greater than or equal to largest value in X +# ------------------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------------ +# Y One-hot-encoded matrix with shape (N, numClasses) +# ------------------------------------------------------------------------------------ + +def m_toOneHot(X:matrix, numClasses:si64) -> matrix { + + if (numClasses < aggMax(X)) { + stop("numClasses must be >= largest value in X to prevent cropping"); + } + + Y = ctable(seq(as.f64(1), as.si64(nrow(X)), 1 <= as.si64(nrow(X)) ? 
1 : -1), X, as.si64(nrow(X)), numClasses); + return as.matrix(Y); +} + diff --git a/scripts/translated_scripts/alsPredict.daph b/scripts/translated_scripts/alsPredict.daph new file mode 100644 index 000000000..5ae6a6ffc --- /dev/null +++ b/scripts/translated_scripts/alsPredict.daph @@ -0,0 +1,22 @@ +def m_alsPredict(userIDs:matrix, I:matrix, L:matrix, R:matrix) -> matrix { + n = as.si64(nrow(userIDs)); + X_user_max = aggMax(userIDs); + + if (X_user_max > as.si64(nrow(L))) { + print("Predictions cannot be provided. Maximum user-id exceeds the number of users."); + return [0.0]; + } + + + if (as.si64(ncol(L)) != as.si64(nrow(R))) { + print("Predictions cannot be provided. Number of columns of L don't match the number of rows of R."); + return [0.0]; + } + + P = ctable(seq(as.f64(1), n, 1 <= n ? 1 : -1), userIDs, n, as.si64(nrow(L))); + Usel = P @ L; + Isel = P @ I; + Y = (Isel == 0) * (Usel @ R); + return as.matrix(Y); +} + diff --git a/scripts/translated_scripts/bug.daph b/scripts/translated_scripts/bug.daph new file mode 100644 index 000000000..22db56f01 --- /dev/null +++ b/scripts/translated_scripts/bug.daph @@ -0,0 +1,203 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# +# Builtin function that implements the k-Means clustering algorithm +# +# INPUT: +# --------------------------------------------------------------------------------------- +# X The input Matrix to do KMeans on. +# k Number of centroids +# runs Number of runs (with different initial centroids) +# max_iter Maximum number of iterations per run +# eps Tolerance (epsilon) for WCSS change ratio +# is_verbose do not print per-iteration stats +# avg_sample_size_per_centroid Average number of records per centroid in data samples +# seed The seed used for initial sampling. If set to -1 +# random seeds are selected. +# --------------------------------------------------------------------------------------- +# +# OUTPUT: +# --------------------------------------------------------------- +# Y The mapping of records to centroids +# C The output matrix with the centroids +# --------------------------------------------------------------- + +def get_sample_maps(num_records:si64, num_samples:si64, approx_sample_size:si64, seed:si64) -> matrix, matrix, si64 { + sample_col_map = [0.0]; + if (approx_sample_size < num_records) { + sample_block_size = as.si64(approx_sample_size + round(10 * sqrt(approx_sample_size))); + num_rows = sample_block_size * num_samples; + sample_rec_ids = rand(sample_block_size, num_samples, 0.0, 1.0, 0.5, seed); + sample_rec_ids = round(log(sample_rec_ids, 2) / log(1.0 - as.f64(approx_sample_size) / as.f64(num_records), 2) + 0.5); + sample_rec_ids = cumSum(sample_rec_ids); + is_sample_rec_id_within_range = (sample_rec_ids <= num_records); + sample_rec_ids = as.f64(sample_rec_ids * is_sample_rec_id_within_range + (num_records + 1) * (1 - is_sample_rec_id_within_range)); + sample_rec_ids = reshape(as.f64(sample_rec_ids), num_rows, 1); + is_row_in_samples = 
reshape(as.f64(is_sample_rec_id_within_range), num_rows, 1); + sample_maps = ctable(seq(as.f64(1), num_rows, 1 <= num_rows ? 1 : -1), sample_rec_ids, num_rows, num_records); + sample_positions = (num_rows + 1) - as.f64(is_row_in_samples) * seq(as.f64(num_rows), 1, (-1.0)); + col_positions = round(0.5 + seq(as.f64(0), num_rows - 1, 1) / sample_block_size); + sample_col_map = ctable(sample_positions, col_positions - 1); + sample_col_map = as.matrix(sample_col_map[0:(num_rows), ]); + return as.matrix(sample_maps), as.matrix(sample_col_map), sample_block_size; + } + one_per_record = fill(as.f64(1), num_records, 1); + sample_block_size = as.si64(num_records); + sample_maps = fill(as.f64(0), (num_records * num_samples), num_records); + sample_col_map = fill(as.f64(0), (num_records * num_samples), num_samples); + for(i in 1:num_samples) { + sample_maps[(num_records*(i-1)+1) - 1:(num_records*i), ] = diagMatrix(one_per_record); + sample_col_map[(num_records*(i-1)+1) - 1:(num_records*i), i - 1] = one_per_record; + } + return as.matrix(sample_maps), as.matrix(sample_col_map), sample_block_size; +} + +def m_kmeans(X:matrix, k:si64 /*= 10*/, runs:si64 /*= 10*/, max_iter:si64 /*= 1000*/, eps:f64 /*= 0.000001*/, is_verbose:bool /*= false*/, avg_sample_size_per_centroid:si64 /*= 50*/, seed:si64 /*= (-1.0)*/) { + if (is_verbose) { + print("BEGIN K-MEANS SCRIPT"); + } + + num_records = as.si64(nrow(X)); + num_features = as.si64(ncol(X)); + num_centroids = k; + num_runs = runs; + + if (is_verbose) { + print("dim X="); print(as.si64(nrow(X))); print("x"); print(as.si64(ncol(X))); + } + + sumXsq = sum(X ^ 2); + + if (is_verbose) { + print("Taking data samples for initialization..."); + } + + sample_maps, samples_vs_runs_map, sample_block_size = get_sample_maps(num_records, num_runs, num_centroids * avg_sample_size_per_centroid, seed); + + is_row_in_samples = sum(sample_maps, 0); + X_samples = sample_maps @ X; + X_samples_sq_norms = sum(X_samples ^ 2, 0); + + if (is_verbose) { + 
print("Initializing the centroids for all runs..."); + } + + All_Centroids = fill(as.f64(0), num_runs * num_centroids, num_features); + + min_distances = is_row_in_samples; + + for(i in 1:num_centroids) { + min_distances_matrix_form = reshape(min_distances, sample_block_size, num_runs); + cdf_min_distances = cumSum(min_distances_matrix_form); + + lseed = seed == (-1.0) ? (-1.0) : as.f64(seed + i); + random_row = rand(1, num_runs, 0.0, 1.0, 1.0, lseed); # this is not equal to: rand(rows=1, cols=num_runs, seed=lseed) + threshold_matrix = random_row * cdf_min_distances[sample_block_size - 1, ]; + centroid_ids = t(sum(cdf_min_distances < threshold_matrix, 1)) + 1; + + centroid_placer = ctable(seq(as.f64(1), num_runs, 1 <= num_runs ? 1 : -1), sample_block_size * seq(as.f64(0), num_runs - 1, 0 <= num_runs - 1 ? 1 : -1) + centroid_ids, num_runs, sample_block_size * num_runs); + centroids = centroid_placer @ X_samples; + + centroid_placer = ctable(seq(as.f64(i), num_centroids * (num_runs - 1) + i, num_centroids), seq(as.f64(1), num_runs, 1), as.si64(nrow(All_Centroids)), num_runs); + All_Centroids = as.matrix(All_Centroids + centroid_placer @ centroids); + + distances = X_samples_sq_norms + samples_vs_runs_map @ sum(centroids ^ 2, 0) - 2 * sum(X_samples * (samples_vs_runs_map @ centroids), 0); + if (i == 1) { + min_distances == as.matrix(is_row_in_samples * distances); + } else { + min_distances == as.matrix(min(min_distances, distances)); + } + } + + # Perform K-Means iterations for all runs: + + termination_code = fill(as.f64(0), num_runs, 1); + final_wcss = fill(as.f64(0), num_runs, 1); + num_iterations = fill(as.f64(0), num_runs, 1); + + if (is_verbose) { + print("Performing k-means iterations for all runs..."); + } + + C = [1.0]; + + for(run_index in 1:num_runs) { + C = All_Centroids[(num_centroids*(run_index-1)+1) - 1 : (num_centroids*run_index), ]; + C_old = C; + iter_count = 0; + term_code = 0; + wcss = inf; + print(run_index); + + while (term_code == 0) { + 
print("X"); + print(typeOf(X)); + + print("C in loop"); + + # depending on whether you use t(C) or C you get different calculation results + print(typeOf(t(C))); + # print(typeOf(C)); + + C_sum = t(sum(C ^ 2, 0)); + print("C_sum"); + print(typeOf(C_sum)); + D = (-2.0) * (X @ t(C)) + C_sum; + minD = aggMin(D, 0); + wcss_old = wcss; + wcss = as.f64(sumXsq + sum(minD)); + + if (is_verbose) { + if (iter_count == 0) { + print("Run ", 0); print(run_index, 0); print(", At Start-Up: Centroid WCSS = ", 0); print(wcss); + } else { + print("Run "); print(run_index); print(", Iteration "); print(iter_count); print(": Centroid WCSS = "); print(wcss); print("; Centroid change (avg.sq.dist.) = "); print((sum((C - C_old) ^ 2) / num_centroids)); + } + + } + + P = D <= minD; + P = as.matrix(P / sum(P, 0)); + P_denom = sum(P, 1); + C_new = (t(P) @ X) / t(P_denom); + + iter_count = as.si64(iter_count + 1); + if (wcss_old - wcss < eps * wcss) { + term_code = as.si64(1); + } else { + if (iter_count >= max_iter) { + term_code = as.si64(2); + } else { + if (sum(P_denom <= 0) > 0) { + term_code = as.si64(3); + } else { + C_old = as.matrix(C); + C = as.matrix(C_new); + } + } + } + } + + } + + +} + diff --git a/scripts/translated_scripts/components.daph b/scripts/translated_scripts/components.daph new file mode 100644 index 000000000..15b79a21d --- /dev/null +++ b/scripts/translated_scripts/components.daph @@ -0,0 +1,25 @@ +def m_components(G:matrix, maxi:si64 /*= 0*/, verbose:bool /*= true*/) -> matrix { + + if (sum(sum(G, 0) != t(sum(G, 1))) > 0) { + print("Connected Components: input graph needs to be " + "symmetric but rowSums and colSums don't match up."); + return [0.0]; + } + + c = seq(as.f64(1), as.si64(nrow(G)), 1 <= as.si64(nrow(G)) ? 
# Solves the least-squares normal equations t(A) @ A @ x = t(A) @ b with the
# conjugate-gradient method.
#
# INPUT:
#   A              design matrix
#   b              right-hand side vector
#   max_iteration  iteration cap; a negative value selects nrow(A)
#   tolerance      relative residual tolerance; a negative value selects 1e-6
#
# OUTPUT:
#   the approximate solution vector x
def CGSolver(A:matrix, b:matrix, max_iteration:si64, tolerance:f64) -> matrix {
    n = as.si64(nrow(A));

    # Negative settings select the defaults.
    if (max_iteration < 0) {
        max_iteration = as.si64(n);
    }
    if (tolerance < 0) {
        tolerance = as.f64(0.000001);
    }

    x = fill(as.f64(0), n, 1);
    # With x = 0, the initial residual of the normal equations is -t(A) @ b.
    residual = (0.0 - t(A)) @ b;
    direction = (0.0 - residual);
    rss = sum(residual ^ 2);
    # Stop once the residual has shrunk by the requested relative factor.
    rssTarget = rss * tolerance ^ 2;

    it = as.si64(0);
    while (it < max_iteration && rss > rssTarget) {
        Ad = as.f64(t(A) @ (A @ direction));
        stepSize = as.f64(rss / sum(direction * Ad));
        x = as.matrix(x + stepSize * direction);
        residual = as.matrix(residual + stepSize * Ad);
        rssOld = rss;
        rss = sum(residual ^ 2);
        # Fletcher-Reeves update of the search direction.
        direction = as.matrix((0.0 - residual) + (rss / rssOld) * direction);
        it = as.si64(it + 1);
    }

    if (it >= max_iteration) {
        print("Warning: the maximum number of iterations has been reached.");
    }
    return as.matrix(x);
}
dir:str) -> matrix { + r = as.si64(nrow(X)); + c = as.si64(ncol(X)); + rn = r + rby; + cn = c + cby; + Y = fill(as.f64(0.0), rn, cn); + + if (dir == "tr") { + Y[1+rby - 1:rn, 0:c] = X; + } else { + + if (dir == "bl") { + Y[0:r, 1+cby - 1:cn] = X; + } else { + + if (dir == "tl") { + Y[1+rby - 1:rn, 1+cby - 1:cn] = X; + } else { + + if (dir == "br") { + Y[0:r, 0:c] = X; + } else { + print("Unknown direction dir => " + dir); + return [0.0]; + } + + } + + } + + } + + return as.matrix(Y); +} + +def calcKnotsDerivKsCG(X:matrix, Y:matrix, max_iteration:si64, tolerance:f64) -> matrix { + nx = as.si64(nrow(X)); + ny = as.si64(nrow(Y)); + + if (nx != ny) { + print("X and Y vectors are of different size"); + return [0.0]; + } + + Xu = truncCG(as.matrix(X), as.si64(1), as.str("up")); + Xd = truncCG(as.matrix(X), as.si64(1), as.str("down")); + Bx = 1 / (Xu - Xd); + Bxd = resizeCG(as.matrix(Bx), as.si64(1), as.si64(0), as.str("tr")); + Bxu = resizeCG(as.matrix(Bx), as.si64(1), as.si64(0), as.str("br")); + Dx = 2 * (Bxd + Bxu); + MDx = diagMatrix(Dx); + MBx = diagMatrix(Bx); + MUx = resizeCG(as.matrix(MBx), as.si64(1), as.si64(1), as.str("bl")); + MLx = resizeCG(as.matrix(MBx), as.si64(1), as.si64(1), as.str("tr")); + A = MUx + MDx + MLx; + Yu = truncCG(as.matrix(Y), as.si64(1), as.str("up")); + Yd = truncCG(as.matrix(Y), as.si64(1), as.str("down")); + By = (Yu - Yd) / (Bx * Bx); + By1 = resizeCG(as.matrix(By), as.si64(1), as.si64(0), as.str("tr")); + By2 = resizeCG(as.matrix(By), as.si64(1), as.si64(0), as.str("br")); + b = 3 * (By1 + By2); + K = CGSolver(as.matrix(A), as.matrix(b), as.si64(max_iteration), as.f64(tolerance)); + return as.matrix(K); +} + +def interpSplineCG(x:f64, X:matrix, Y:matrix, K:matrix) -> f64 { + i = as.si64(as.si64(nrow(X)) - sum(X >= x) + 1); + t = (x - X[i-1 - 1, 0]) / (X[i - 1, 0] - X[i-1 - 1, 0]); + a = K[i-1 - 1, 0] * (X[i - 1, 0] - X[i-1 - 1, 0]) - (Y[i - 1, 0] - Y[i-1 - 1, 0]); + b = as.matrix((0.0 - K[i - 1, 0]) * (X[i - 1, 0] - X[i-1 - 1, 0]) + 
(Y[i - 1, 0] - Y[i-1 - 1, 0])); + qm = (1 - t) * Y[i-1 - 1, 0] + t * Y[i - 1, 0] + t * (1 - t) * (a * (1 - t) + b * t); + q = as.scalar(as.f64(qm)); + return as.f64(q); +} + +def m_csplineCG(X:matrix, Y:matrix, inp_x:f64, tol:f64 /*= 0.000001*/, maxi:si64 /*= 0*/) -> matrix, matrix { + K = calcKnotsDerivKsCG(as.matrix(X), as.matrix(Y), as.si64(maxi), as.f64(tol)); + y = interpSplineCG(as.f64(inp_x), as.matrix(X), as.matrix(Y), as.matrix(K)); + pred_Y = fill(as.f64(y), 1, 1); + return as.matrix(pred_Y), as.matrix(K); +} + diff --git a/scripts/translated_scripts/csplineDS.daph b/scripts/translated_scripts/csplineDS.daph new file mode 100644 index 000000000..dd01b6db4 --- /dev/null +++ b/scripts/translated_scripts/csplineDS.daph @@ -0,0 +1,113 @@ +def interpSpline(x:f64, X:matrix, Y:matrix, K:matrix) -> f64 { + i = as.si64(as.si64(nrow(X)) - sum(X >= x) + 1); + t = (x - X[i-1 - 1, 0]) / (X[i - 1, 0] - X[i-1 - 1, 0]); + a = K[i-1 - 1, 0] * (X[i - 1, 0] - X[i-1 - 1, 0]) - (Y[i - 1, 0] - Y[i-1 - 1, 0]); + b = as.matrix((0.0 - K[i - 1, 0]) * (X[i - 1, 0] - X[i-1 - 1, 0]) + (Y[i - 1, 0] - Y[i-1 - 1, 0])); + qm = (1 - t) * Y[i-1 - 1, 0] + t * Y[i - 1, 0] + t * (1 - t) * (a * (1 - t) + b * t); + q = as.scalar(as.f64(qm)); + return as.f64(q); +} + +def resize(X:matrix, rby:si64, cby:si64, dir:str) -> matrix { + r = as.si64(nrow(X)); + c = as.si64(ncol(X)); + rn = r + rby; + cn = c + cby; + Y = fill(as.f64(0.0), rn, cn); + + if (dir == "tr") { + Y[1+rby - 1:rn, 0:c] = X; + } else { + + if (dir == "bl") { + Y[0:r, 1+cby - 1:cn] = X; + } else { + + if (dir == "tl") { + Y[1+rby - 1:rn, 1+cby - 1:cn] = X; + } else { + + if (dir == "br") { + Y[0:r, 0:c] = X; + } else { + print("Unknown direction dir => " + dir); + return [0.0]; + } + + } + + } + + } + + return as.matrix(Y); +} + +def trunc(X:matrix, by:si64, dir:str) -> matrix { + r = as.si64(nrow(X)); + c = as.si64(ncol(X)); + + if (by > r) { + print("can't pop matrix more than number of rows"); + return [0.0]; + } + + Y = 
fill(as.f64(0.0), r - by, c); + + if (r != by) { + + if (dir == "up") { + Y[0:r-by, ] = X[1+by - 1:r, ]; + } else { + + if (dir == "down") { + Y[0:r-by, ] = as.matrix(X[0:r-by, ]); + } else { + print("trunc unsupported direction " + dir); + return [0.0]; + } + + } + + } + + return as.matrix(Y); +} + +def calcKnotsDerivKs(X:matrix, Y:matrix) -> matrix { + nx = as.si64(nrow(X)); + ny = as.si64(nrow(Y)); + + if (nx != ny) { + print("X and Y vectors are of different size"); + return [0.0]; + } + + Xu = trunc(as.matrix(X), as.si64(1), as.str("up")); + Xd = trunc(as.matrix(X), as.si64(1), as.str("down")); + Bx = 1 / (Xu - Xd); + Bxd = resize(as.matrix(Bx), as.si64(1), as.si64(0), as.str("tr")); + Bxu = resize(as.matrix(Bx), as.si64(1), as.si64(0), as.str("br")); + Dx = 2 * (Bxd + Bxu); + MDx = diagMatrix(Dx); + MBx = diagMatrix(Bx); + MUx = resize(as.matrix(MBx), as.si64(1), as.si64(1), as.str("bl")); + MLx = resize(as.matrix(MBx), as.si64(1), as.si64(1), as.str("tr")); + A = MUx + MDx + MLx; + Yu = trunc(as.matrix(Y), as.si64(1), as.str("up")); + Yd = trunc(as.matrix(Y), as.si64(1), as.str("down")); + By = (Yu - Yd) / (Bx * Bx); + By1 = resize(as.matrix(By), as.si64(1), as.si64(0), as.str("tr")); + By2 = resize(as.matrix(By), as.si64(1), as.si64(0), as.str("br")); + b = 3 * (By1 + By2); + K = solve(A, b); + return as.matrix(K); +} + +def m_csplineDS(X:matrix, Y:matrix, inp_x:f64) -> matrix, matrix { + K = calcKnotsDerivKs(as.matrix(X), as.matrix(Y)); + y = interpSpline(as.f64(inp_x), as.matrix(X), as.matrix(Y), as.matrix(K)); + pred_Y = fill(as.f64(y), 1, 1); + return as.matrix(pred_Y), as.matrix(K); +} + diff --git a/scripts/translated_scripts/hospitalResidencyMatch.daph b/scripts/translated_scripts/hospitalResidencyMatch.daph new file mode 100644 index 000000000..2b5b87301 --- /dev/null +++ b/scripts/translated_scripts/hospitalResidencyMatch.daph @@ -0,0 +1,99 @@ +def m_hospitalResidencyMatch(R:matrix, H:matrix, capacity:matrix, verbose:bool /*= false*/) -> matrix, 
# Hospital-residency matching (many-to-one stable matching, resident-proposing).
#
# INPUT:
#   R         residents' preferences (row i = resident i's ranked hospital ids)
#   H         hospitals' preferences (row j = hospital j's ranking of residents)
#   capacity  column vector with the remaining capacity of each hospital
#   verbose   print the resulting matchings
#
# OUTPUT:
#   residencyMatch  matching from the residents' side
#   hospitalMatch   matching from the hospitals' side
def m_hospitalResidencyMatch(R:matrix, H:matrix, capacity:matrix, verbose:bool /*= false*/) -> matrix, matrix {
    print("STARTING RESIDENCY MATCH ALGORITHM");
    print("READING R as residents AND H as Hospitals and capacity...");
    m = as.si64(nrow(R));
    n = as.si64(ncol(R));
    residencyMatch = fill(as.f64(0.0), m, n);
    hospitalMatch = fill(as.f64(0.0), n, m);
    resultmMatrix = fill(as.f64(0.0), as.si64(nrow(R)), as.si64(ncol(R)));

    if (as.si64(nrow(capacity)) != as.si64(nrow(H))) {
        print("ERROR: Missing capacity info for some hospitals");
    }

    startM = fill(as.f64(1.0), m, 1);             # 1 = resident still proposing
    hIndex = fill(as.f64(1.0), m, 1);             # hospital currently proposed to
    proposer_pointers = fill(as.f64(1.0), m, 1);  # next preference to propose to
    prev_Residents_vector = fill(as.f64(1.0), n, 1);
    prevIndex_Residents_vector = fill(as.f64(1.0), n, 1);
    prev_Residents_vector = aggMin(hospitalMatch, 0);
    prevIndex_Residents_vector = idxMax(hospitalMatch, 0);
    while (sum(startM) > 0) {
        for(i in 1:m) {
            if (as.scalar(as.f64(startM[i - 1])) == 1) {
                secondIndex = as.scalar(as.f64(proposer_pointers[i - 1]));
                hIndex[i - 1] = as.scalar(as.f64(R[i - 1, secondIndex - 1]));
                prev_Residents_vector = aggMax(hospitalMatch, 0);
                prevIndex_Residents_vector = idxMax(hospitalMatch, 0);

                if (as.scalar(as.f64(hIndex[i - 1])) != 0) {
                    # BUGFIX: hIndex was read as hIndex[i] (one row too far) in all
                    # accesses below; 0-based indexing requires i - 1 throughout.
                    hosValue = as.scalar(as.f64(H[as.scalar(hIndex[i - 1]) - 1, i - 1]));
                    if (hosValue > 0) {
                        if (as.scalar(as.f64(capacity[as.scalar(hIndex[i - 1]) - 1, 0])) >= 1) {
                            # Hospital has free capacity: accept the proposal.
                            capacity[as.scalar(hIndex[i - 1]) - 1, 0] = as.scalar(as.f64(capacity[as.scalar(hIndex[i - 1]) - 1, 0])) - 1;
                            residencyMatch[i - 1, as.scalar(hIndex[i - 1]) - 1] = as.scalar(as.f64(proposer_pointers[i - 1]));
                            hospitalMatch[as.scalar(hIndex[i - 1]) - 1, i - 1] = hosValue;
                            startM[i - 1] = as.matrix(0.0);
                            proposer_pointers[i - 1] = as.scalar(as.f64(proposer_pointers[i - 1])) + 1;
                            if (as.scalar(as.f64(proposer_pointers[i - 1])) > n) {
                                # BUGFIX: "n.0" was not a valid literal; clamp the
                                # pointer to the last preference.
                                proposer_pointers[i - 1] = as.f64(n);
                            }
                        } else {
                            # Hospital is full: bump its worst current resident if
                            # this proposal ranks at least as well.
                            if (as.scalar(as.f64(prev_Residents_vector[as.scalar(hIndex[i - 1]) - 1])) >= secondIndex) {
                                resPrev = as.scalar(as.f64(prevIndex_Residents_vector[as.scalar(hIndex[i - 1]) - 1, 0]));
                                hospitalMatch[as.scalar(hIndex[i - 1]) - 1, resPrev - 1] = as.matrix(0.0);
                                residencyMatch[resPrev - 1, as.scalar(hIndex[i - 1]) - 1] = as.matrix(0.0);
                                hospitalMatch[as.scalar(hIndex[i - 1]) - 1, i - 1] = as.scalar(as.f64(proposer_pointers[i - 1]));
                                residencyMatch[i - 1, as.scalar(hIndex[i - 1]) - 1] = as.scalar(as.f64(proposer_pointers[i - 1]));
                                startM[i - 1] = as.matrix(0.0);
                                prevResIndex = as.scalar(as.f64(prevIndex_Residents_vector[as.scalar(hIndex[i - 1]) - 1, 0]));
                                if (prevResIndex > 0) {
                                    # Displaced resident starts proposing again.
                                    startM[prevResIndex - 1] = as.matrix(1.0);
                                    proposer_pointers[i - 1] = as.f64(as.scalar(as.f64(proposer_pointers[i - 1])) + 1);
                                    if (as.scalar(as.f64(proposer_pointers[i - 1])) > n) {
                                        proposer_pointers[i - 1] = as.f64(n);
                                    }
                                }
                            }
                        }
                    }
                }

                # Proposal rejected: advance to the next preference.
                if (as.scalar(as.f64(startM[i - 1])) == 1) {
                    proposer_pointers[i - 1] = as.f64(as.scalar(as.f64(proposer_pointers[i - 1])) + 1);
                    if (as.scalar(as.f64(proposer_pointers[i - 1])) > n) {
                        proposer_pointers[i - 1] = as.f64(n);
                    }
                }
            }
        }
    }

    if (verbose) {
        print("residencyMatch");
        print(residencyMatch);
        print("hospitalMatch");
        print(hospitalMatch);
    }
    return as.matrix(residencyMatch), as.matrix(hospitalMatch);
}
as.si64(nrow(img_in)); + start_h = (ceil((orig_h - h) / 2)) + y_offset; + end_h = (start_h + h - 1); + start_w = (ceil((orig_w - w) / 2)) + x_offset; + end_w = (start_w + w - 1); + img_out = [0.0]; + if ((start_h < 0) || (end_h > orig_h) || (start_w < 0) || (end_w > orig_w)) { + print("Offset out of bounds! Returning input."); + img_out = img_in; + } else { + mask = fill(as.f64(0), orig_h, orig_w); + temp_mask = fill(as.f64(1), h, w); + mask[start_h - 1:end_h, start_w - 1:end_w] = temp_mask; + mask = reshape(mask, 1, orig_w * orig_h); + img_out = reshape((reshape(img_in + 1, 1, orig_w * orig_h))[[, mask]] - 1, h, w); + } + + return as.matrix(img_out); +} + diff --git a/scripts/translated_scripts/img_crop_linearized.daph b/scripts/translated_scripts/img_crop_linearized.daph new file mode 100644 index 000000000..0e9d2de86 --- /dev/null +++ b/scripts/translated_scripts/img_crop_linearized.daph @@ -0,0 +1,23 @@ +def m_img_crop_linearized(img_in:matrix, w:si64, h:si64, x_offset:si64, y_offset:si64, s_cols:si64, s_rows:si64) -> matrix { + orig_w = s_cols; + orig_h = s_rows; + nrows = as.si64(nrow(img_in)); + start_h = (ceil((orig_h - h) / 2)) + y_offset; + end_h = (start_h + h - 1); + start_w = (ceil((orig_w - w) / 2)) + x_offset; + end_w = (start_w + w - 1); + img_out = [0.0]; + if ((start_h < 0) || (end_h > orig_h) || (start_w < 0) || (end_w > orig_w)) { + print("Offset out of bounds! 
# Replaces a rectangular region of the image with a constant fill value
# (cutout augmentation). Coordinates are 1-based and inclusive; the rectangle
# is clipped to the image bounds.
def m_img_cutout(img_in:matrix, x:si64, y:si64, width:si64, height:si64, fill_value:f64) -> matrix {
    numRows = as.si64(nrow(img_in));
    numCols = as.si64(ncol(img_in));
    result = [0.0];
    if (width < 1 || height < 1) {
        print("Invalid width or height. Returning input");
        result = img_in;
    } else {
        # Clip the requested rectangle to the image.
        top = max(1, y);
        left = max(1, x);
        bottom = min(numRows, y + height - 1);
        right = min(numCols, x + width - 1);
        result = reshape(img_in, numRows, numCols);
        result[top - 1:bottom, left - 1:right] = fill(as.f64(fill_value), (bottom - top + 1), (right - left + 1));
    }

    return as.matrix(result);
}
bits)); + return as.matrix(img_out); +} + diff --git a/scripts/translated_scripts/img_posterize_linearized.daph b/scripts/translated_scripts/img_posterize_linearized.daph new file mode 100644 index 000000000..c40d828aa --- /dev/null +++ b/scripts/translated_scripts/img_posterize_linearized.daph @@ -0,0 +1,5 @@ +def m_img_posterize_linearized(img_in:matrix, bits:si64) -> matrix { + img_out = (floor(img_in / 2 ^ (8 - bits))) * (2 ^ (8 - bits)); + return as.matrix(img_out); +} + diff --git a/scripts/translated_scripts/img_sample_pairing.daph b/scripts/translated_scripts/img_sample_pairing.daph new file mode 100644 index 000000000..5abf38f13 --- /dev/null +++ b/scripts/translated_scripts/img_sample_pairing.daph @@ -0,0 +1,11 @@ +def m_img_sample_pairing(img_in1:matrix, img_in2:matrix, weight:f64) -> matrix { + + if (weight < 0 || 1 < weight) { + print("Invalid weight. Set weight to 0.5"); + weight = as.f64(0.5); + } + + img_out = (1 - weight) * img_in1 + weight * img_in2; + return as.matrix(img_out); +} + diff --git a/scripts/translated_scripts/img_sample_pairing_linearized.daph b/scripts/translated_scripts/img_sample_pairing_linearized.daph new file mode 100644 index 000000000..fd81daeb1 --- /dev/null +++ b/scripts/translated_scripts/img_sample_pairing_linearized.daph @@ -0,0 +1,15 @@ +def m_img_sample_pairing_linearized(img_in1:matrix, img_in2:matrix, weight:f64) -> matrix { + + if (weight < 0 || 1 < weight) { + print("Invalid weight. 
# Assigns every row of X to the index of its nearest centroid in C.
def m_kmeansPredict(X:matrix, C:matrix) -> matrix {
    # Squared distance up to the row-constant term sum(X^2), which does not
    # affect the per-row argmin: -2*X*C' + ||C||^2.
    partialDist = (-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0));
    nearest = (partialDist <= aggMin(partialDist, 0));
    return as.matrix(idxMax(nearest, 0));
}
a/scripts/translated_scripts/kmeans_final.daph b/scripts/translated_scripts/kmeans_final.daph new file mode 100644 index 000000000..f9a8426d4 --- /dev/null +++ b/scripts/translated_scripts/kmeans_final.daph @@ -0,0 +1,249 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# +# Builtin function that implements the k-Means clustering algorithm +# +# INPUT: +# --------------------------------------------------------------------------------------- +# X The input Matrix to do KMeans on. +# k Number of centroids +# runs Number of runs (with different initial centroids) +# max_iter Maximum number of iterations per run +# eps Tolerance (epsilon) for WCSS change ratio +# is_verbose do not print per-iteration stats +# avg_sample_size_per_centroid Average number of records per centroid in data samples +# seed The seed used for initial sampling. If set to -1 +# random seeds are selected. 
# Builds the linear-algebra helpers used to draw num_samples geometric data
# samples of approximately approx_sample_size records each.
#
# OUTPUT:
#   sample_maps    selection matrix mapping sample rows to data records
#   sample_col_map maps each sample row to its run (column)
#   sample_block_size  number of rows reserved per sample
def get_sample_maps(num_records:si64, num_samples:si64, approx_sample_size:si64, seed:si64) -> matrix, matrix, si64 {
    sample_col_map = [0.0];
    if (approx_sample_size < num_records) {
        # Pad the block size so a sample almost surely fits (~10 std-devs).
        sample_block_size = as.si64(approx_sample_size + round(10 * sqrt(approx_sample_size)));
        num_rows = sample_block_size * num_samples;

        # Geometric skip-ahead sampling of record ids.
        sample_rec_ids = rand(sample_block_size, num_samples, 0.001, 1.0, 1.0, seed);
        # BUGFIX: cast the si64 operands before dividing; integer division would
        # truncate approx_sample_size / num_records to 0, making the denominator
        # log(1.0, 2) == 0 and the ratio a division by zero. (The bug.daph
        # variant of this script already carries these casts.)
        sample_rec_ids = round(log(sample_rec_ids, 2) / log(1.0 - as.f64(approx_sample_size) / as.f64(num_records), 2) + 0.5);
        sample_rec_ids = cumSum(sample_rec_ids);

        # Ids that ran past the data are parked at num_records + 1 and masked.
        is_sample_rec_id_within_range = sample_rec_ids <= num_records;
        sample_rec_ids = as.f64(sample_rec_ids * is_sample_rec_id_within_range + (num_records + 1) * (1 - is_sample_rec_id_within_range));

        sample_rec_ids = reshape(t(as.f64(sample_rec_ids)), num_rows, 1);
        is_row_in_samples = reshape(t(as.f64(is_sample_rec_id_within_range)), num_rows, 1);

        sample_maps = ctable(seq(as.f64(0), num_rows - 1), sample_rec_ids, num_rows, num_records);

        # Compact valid sample rows to the top and record their run (column).
        sample_positions = (num_rows + 1) - as.f64(is_row_in_samples) * seq(as.f64(num_rows), 1, (-1.0));
        col_positions = round(0.5 + seq(as.f64(0), num_rows - 1, 1) / sample_block_size);
        sample_col_map = ctable(sample_positions - 1, col_positions - 1);
        sample_col_map = as.matrix(sample_col_map[0:(num_rows), ]);
        return as.matrix(sample_maps), as.matrix(sample_col_map), sample_block_size;
    }
    # Fallback: the sample is the full data set, repeated once per run.
    one_per_record = fill(as.f64(1), num_records, 1);
    sample_block_size = as.si64(num_records);
    sample_maps = fill(as.f64(0), (num_records * num_samples), num_records);
    sample_col_map = fill(as.f64(0), (num_records * num_samples), num_samples);
    for(i in 1:num_samples) {
        sample_maps[(num_records*(i-1)+1) - 1:(num_records*i), ] = diagMatrix(one_per_record);
        sample_col_map[(num_records*(i-1)+1) - 1:(num_records*i), i - 1] = one_per_record;
    }
    return as.matrix(sample_maps), as.matrix(sample_col_map), sample_block_size;
}
(-1.0) : as.f64(seed + i); + random_row = rand(1, num_runs, 0.001, 1.0, 1.0, lseed); # this is not equal to: rand(rows=1, cols=num_runs, seed=lseed) + threshold_matrix = random_row * cdf_min_distances[sample_block_size - 1, ]; + centroid_ids = t(sum(cdf_min_distances < threshold_matrix, 1)) + 1; + + centroid_placer = ctable(seq(as.f64(1), num_runs), sample_block_size * seq(as.f64(0), num_runs - 1) + centroid_ids, num_runs, sample_block_size * num_runs); + print(typeOf(centroid_placer)); + centroids = centroid_placer @ X_samples; + + centroid_placer = ctable(seq(as.f64(i), num_centroids * (num_runs - 1) + i, num_centroids), seq(as.f64(1), num_runs, 1), as.si64(nrow(All_Centroids)), num_runs); + All_Centroids = as.matrix(All_Centroids + centroid_placer @ centroids); + + distances = X_samples_sq_norms + samples_vs_runs_map @ sum(centroids ^ 2, 0) - 2 * sum(X_samples * (samples_vs_runs_map @ centroids), 0); + if (i == 1) { + min_distances = as.matrix(is_row_in_samples * distances); + } else { + min_distances = as.matrix(min(min_distances, distances)); + } + } + + # Perform K-Means iterations for all runs: + + termination_code = fill(as.f64(0), num_runs, 1); + final_wcss = fill(as.f64(0), num_runs, 1); + num_iterations = fill(as.f64(0), num_runs, 1); + + if (is_verbose) { + print("Performing k-means iterations for all runs..."); + } + + C = [0.0]; + + for(run_index in 1:num_runs) { + C = All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ]; + C_old = C; + iter_count = 0; + term_code = 0; + wcss = inf; + + while (term_code == 0) { + D = (-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0)); + minD = aggMin(D, 0); + wcss_old = wcss; + wcss = sumXsq + sum(minD); + + if (is_verbose) { + if (iter_count == 0) { + print("Run "+run_index+", At Start-Up: Centroid WCSS = "+wcss); + } else { + print("Run "+run_index+", Iteration "+iter_count+": Centroid WCSS = "+wcss+"; Centroid change (avg.sq.dist.) 
= "+(sum((C - C_old) ^ 2) / num_centroids)); + } + + } + + P = D <= minD; + P = as.matrix(P / sum(P, 0)); # this might not work as expected and needs to be tested + P_denom = sum(P, 1); + C = (t(P) @ X) / t(P_denom); + + iter_count = as.si64(iter_count + 1); + if (wcss_old - wcss < eps * wcss) { + term_code = as.si64(1); + } else { + if (iter_count >= max_iter) { + term_code = as.si64(2); + } else { + if (sum(P_denom <= 0) > 0) { + term_code = as.si64(3); + } else { + C_old = C; + } + } + } + } + + if (is_verbose) { + print("Run "+run_index+", Iteration "+iter_count+": Terminated with code = "+term_code+", Centroid WCSS = "+wcss); + } + + + All_Centroids[(num_centroids*(run_index-1)+1) - 1:(num_centroids*run_index), ] = C; + final_wcss[run_index - 1, 0] = [wcss]; + termination_code[run_index - 1, 0] = as.matrix(term_code); + num_iterations[run_index - 1, 0] = as.matrix(iter_count); + } + + # Select the run with the best Centroid-WCSS and output its centroids + + termination_bitmap = fill(as.f64(0), num_runs, 3); + termination_bitmap_raw = ctable(seq(as.f64(0), num_runs - 1, 1), termination_code - 1); + termination_bitmap[, 0:ncol(termination_bitmap_raw)] = termination_bitmap_raw; + termination_stats = sum(termination_bitmap, 1); + + if (is_verbose) { + print("Number of successful runs = "+as.si64(as.scalar(as.f64(termination_stats[0, 0])))); + print("Number of incomplete runs = "+as.si64(as.scalar(as.f64(termination_stats[0, 1])))); + print("Number of failed runs (with lost centroids) = "+as.si64(as.scalar(as.f64(termination_stats[0, 2])))); + } + + num_successful_runs = as.scalar(as.f64(termination_stats[0, 0])); + + Y = [1.0]; + + if (num_successful_runs > 0) { + final_wcss_successful = replace(final_wcss, inf, 0) * termination_bitmap[, 0]; + worst_wcss = aggMax(final_wcss_successful); + best_wcss = aggMin(final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap[, 0])); + avg_wcss = sum(final_wcss_successful) / num_successful_runs; + 
best_index_vector = (final_wcss_successful == best_wcss); + aggr_best_index_vector = cumSum(best_index_vector); + best_index = as.si64(sum(aggr_best_index_vector == 0) + 1); + + if (is_verbose) { + print("Successful runs: Best run is "+best_index+" with Centroid WCSS = "+best_wcss+"; Avg WCSS = "+avg_wcss+"; Worst WCSS = "+worst_wcss); + } + + C = as.matrix(All_Centroids[(num_centroids*(best_index-1)+1) - 1:(num_centroids*best_index), ]); + D = as.matrix((-2.0) * (X @ t(C)) + t(sum(C ^ 2, 0))); + P = as.matrix((D <= aggMin(D, 0))); + aggr_P = t(cumSum(t(P))); + Y = sum(aggr_P == 0, 0) + 1; + + if (is_verbose) { + print("dim C= "+as.si64(nrow(C))+"x"+as.si64(ncol(C))+", dim Y= "+as.si64(nrow(Y))+"x"+as.si64(ncol(Y))); + } + + } else { + print("K-means: No output is produced. Try increasing the number of iterations and/or lower eps."); + C = fill(as.f64(0), num_centroids, num_records); + Y = fill(as.f64((-1.0)), 1, num_records); + } + + return as.matrix(C), as.matrix(Y); +} + diff --git a/scripts/translated_scripts/l2svm.daph b/scripts/translated_scripts/l2svm.daph new file mode 100644 index 000000000..dbf4db654 --- /dev/null +++ b/scripts/translated_scripts/l2svm.daph @@ -0,0 +1,99 @@ +def m_l2svm(X:matrix, Y:matrix, intercept:bool /*= false*/, epsilon:f64 /*= 0.00100000000000000002*/, reg:f64 /*= 1*/, maxIterations:si64 /*= 100*/, maxii:si64 /*= 20*/, verbose:bool /*= false*/, columnId:si64 /*= (-1.0)*/) -> matrix { + + if (as.si64(nrow(X)) < 2) { + stop("L2SVM: Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows"); + } + + + if (epsilon < 0) { + stop("L2SVM: Stopping due to invalid argument: Tolerance (tol) must be non-negative"); + } + + + if (reg < 0) { + stop("L2SVM: Stopping due to invalid argument: Regularization constant (reg) must be non-negative"); + } + + + if (maxIterations < 1) { + stop("L2SVM: Stopping due to invalid argument: Maximum iterations should be a positive integer"); + } + + + if 
(as.si64(ncol(Y)) < 1) { + stop("L2SVM: Stopping due to invalid multiple label columns, maybe use MSVM instead?"); + } + + check_min = aggMin(Y); + check_max = aggMax(Y); + num_min = sum(Y == check_min); + num_max = sum(Y == check_max); + + if (num_min + num_max != as.si64(nrow(Y))) { + print("L2SVM: WARNING invalid number of labels in Y: "); print(num_min); print(" "); print(num_max); + } + + + if (check_min != (-1.0) || check_max != (1.0)) { + Y = as.matrix(2 / (check_max - check_min) * Y - (check_min + check_max) / (check_max - check_min)); + } + + + if (verbose && columnId == (-1.0)) { + print('Running L2-SVM '); + } + + + if (intercept) { + ones = fill(as.f64(1), as.si64(nrow(X)), 1); + X = cbind(X, ones); + } + + w = fill(as.f64(0), as.si64(ncol(X)), 1); + Xw = fill(as.f64(0), as.si64(nrow(X)), 1); + g_old = t(X) @ Y; + s = g_old; + iter = 0; + continue = true; + while (continue && iter < maxIterations) { + step_sz = 0; + Xd = X @ s; + wd = reg * sum(w * s); + dd = reg * sum(s * s); + continue1 = true; + iiter = 0; + while (continue1 && iiter < maxii) { + tmp_Xw = Xw + step_sz * Xd; + out = 1 - Y * (tmp_Xw); + sv = (out > 0); + out = as.matrix(out * sv); + g = wd + step_sz * dd - sum(out * Y * Xd); + h = dd + sum(Xd * sv * Xd); + step_sz = as.si64(step_sz - g / h); + continue1 = as.bool((g * g / h >= epsilon)); + iiter = as.si64(iiter + 1); + } + w = as.matrix(w + step_sz * s); + Xw = as.matrix(Xw + step_sz * Xd); + out = as.matrix(1 - Y * Xw); + sv = as.matrix((out > 0)); + out = as.matrix(sv * out); + obj = 0.5 * sum(out * out) + reg / 2 * sum(w * w); + g_new = t(X) @ (out * Y) - reg * w; + + if (verbose) { + colstr = columnId != (-1.0) ? 
"-- MSVM class=" + columnId + ": " : ""; + print(colstr); print("Iter: "); print(iter); print(" InnerIter: "); print(iiter); print(" --- "); print(" Obj:"); print(obj); + } + + tmp = sum(s * g_old); + continue = as.bool((step_sz * tmp >= epsilon * obj && sum(s ^ 2) != 0)); + be = sum(g_new * g_new) / sum(g_old * g_old); + s = as.matrix(be * s + g_new); + g_old = as.matrix(g_new); + iter = as.si64(iter + 1); + } + model = w; + return as.matrix(model); +} + diff --git a/scripts/translated_scripts/l2svmPredict.daph b/scripts/translated_scripts/l2svmPredict.daph new file mode 100644 index 000000000..6f41f6cd2 --- /dev/null +++ b/scripts/translated_scripts/l2svmPredict.daph @@ -0,0 +1,21 @@ +def m_l2svmPredict(X:matrix, W:matrix, verbose:bool /*= false*/) -> matrix, matrix { + n = as.si64(nrow(X)); + m = as.si64(ncol(X)); + wn = as.si64(nrow(W)); + wm = as.si64(ncol(W)); + + if (m != wn) { + + if (m + 1 != wn) { + stop("l2svm Predict: Invalid shape of W [" + wm + "," + wn + "] or X [" + m + "," + n + "]"); + } + + YRaw = X @ W[0:m, ] + W[m+1 - 1, ]; + } else { + YRaw = as.matrix(X @ W); + } + + Y = idxMax(YRaw, 0); + return as.matrix(YRaw), as.matrix(Y); +} + diff --git a/scripts/translated_scripts/lmCG.daph b/scripts/translated_scripts/lmCG.daph new file mode 100644 index 000000000..2932332c0 --- /dev/null +++ b/scripts/translated_scripts/lmCG.daph @@ -0,0 +1,109 @@ +def m_lmCG(X:matrix, y:matrix, icpt:si64 /*= 0*/, reg:f64 /*= 0.0000001*/, tol:f64 /*= 0.0000001*/, maxi:si64 /*= 0*/, verbose:bool /*= true*/) -> matrix { + intercept_status = icpt; + regularization = reg; + tolerance = tol; + max_iteration = maxi; + n = as.si64(nrow(X)); + m = as.si64(ncol(X)); + scale_lambda = [0.0]; + m_ext = 0; + if (intercept_status == 1 || intercept_status == 2) { + ones_n = fill(as.f64(1), n, 1); + X = cbind(X, ones_n); + m_ext = as.si64(ncol(X)); + scale_lambda = fill(as.f64(1), m_ext, 1); + scale_lambda[m_ext - 1, 0] = as.matrix(0.0); + } else { + scale_lambda = fill(as.f64(1), 
m, 1); + m_ext = as.si64(m); + } + + scale_X = [0.0]; + shift_X = [0.0]; + if (intercept_status == 2) { + avg_X_cols = t(sum(X, 1)) / n; + var_X_cols = (t(sum(X ^ 2, 1)) - n * (avg_X_cols ^ 2)) / (n - 1); + is_unsafe = (var_X_cols <= 0); + scale_X = 1.0 / sqrt(var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X[m_ext - 1, 0] = as.matrix(1.0); + shift_X = (0.0 - avg_X_cols) * scale_X; + shift_X[m_ext - 1, 0] = as.matrix(0.0); + } else { + scale_X = fill(as.f64(1), m_ext, 1); + shift_X = fill(as.f64(0), m_ext, 1); + } + + lambda = scale_lambda * regularization; + beta_unscaled = fill(as.f64(0), m_ext, 1); + + if (max_iteration == 0) { + max_iteration = as.si64(m_ext); + } + + i = 0; + + if (verbose) { + print("Running the CG algorithm..."); + } + + r = (0.0 - t(X)) @ y; + + if (intercept_status == 2) { + r = as.matrix(scale_X * r + shift_X @ r[m_ext - 1, ]); + } + + p = (0.0 - r); + norm_r2 = sum(r ^ 2); + norm_r2_initial = norm_r2; + norm_r2_target = norm_r2_initial * tolerance ^ 2; + + if (verbose) { + print("||r|| initial value = "); print(sqrt(norm_r2_initial)); print(", target value = "); print(sqrt(norm_r2_target)); + } + + while (i < max_iteration && norm_r2 > norm_r2_target) { + ssX_p = [0.0]; + if (intercept_status == 2) { + ssX_p = scale_X * p; + ssX_p[m_ext - 1, ] = ssX_p[m_ext - 1, ] + t(shift_X) @ p; + } else { + ssX_p = as.matrix(p); + } + + q = t(X) @ (X @ ssX_p); + + if (intercept_status == 2) { + q = as.matrix(scale_X * q + shift_X @ q[m_ext - 1, ]); + } + + q = q + lambda * p; + a = norm_r2 / sum(p * q); + beta_unscaled = beta_unscaled + a * p; + r = r + a * q; + old_norm_r2 = norm_r2; + norm_r2 = sum(r ^ 2); + p = as.matrix((0.0 - r) + (norm_r2 / old_norm_r2) * p); + i = as.si64(i + 1); + + if (verbose) { + print("Iteration "); print(i); print(": ||r|| / ||r init|| = "); print(sqrt(norm_r2 / norm_r2_initial)); + } + + } + + if (verbose && i >= max_iteration) { + print("Warning: the maximum number of iterations has been reached."); + } + + beta = 
[0.0]; + if (intercept_status == 2) { + beta = scale_X * beta_unscaled; + beta[m_ext - 1, ] = beta[m_ext - 1, ] + t(shift_X) @ beta_unscaled; + } else { + beta = as.matrix(beta_unscaled); + } + + B = beta; + return as.matrix(B); +} + diff --git a/scripts/translated_scripts/lmDS.daph b/scripts/translated_scripts/lmDS.daph new file mode 100644 index 000000000..7f87732d4 --- /dev/null +++ b/scripts/translated_scripts/lmDS.daph @@ -0,0 +1,62 @@ +def m_lmDS(X:matrix, y:matrix, icpt:si64 /*= 0*/, reg:f64 /*= 0.0000001*/, verbose:bool /*= true*/) -> matrix { + intercept_status = icpt; + regularization = reg; + n = as.si64(nrow(X)); + m = as.si64(ncol(X)); + scale_lambda = [0.0]; + m_ext = 0; + if (intercept_status == 1 || intercept_status == 2) { + ones_n = fill(as.f64(1), n, 1); + X = cbind(X, ones_n); + m_ext = as.si64(ncol(X)); + scale_lambda = fill(as.f64(1), m_ext, 1); + scale_lambda[m_ext - 1, 0] = as.matrix(0.0); + } else { + scale_lambda = fill(as.f64(1), m, 1); + m_ext = as.si64(m); + } + + scale_X = [0.0]; + shift_X = [0.0]; + if (intercept_status == 2) { + avg_X_cols = t(sum(X, 1)) / n; + var_X_cols = (t(sum(X ^ 2, 1)) - n * (avg_X_cols ^ 2)) / (n - 1); + is_unsafe = (var_X_cols <= 0); + scale_X = 1.0 / sqrt(var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X[m_ext - 1, 0] = as.matrix(1.0); + shift_X = (0.0 - avg_X_cols) * scale_X; + shift_X[m_ext - 1, 0] = as.matrix(0.0); + } else { + scale_X = fill(as.f64(1), m_ext, 1); + shift_X = fill(as.f64(0), m_ext, 1); + } + + lambda = scale_lambda * regularization; + A = t(X) @ X; + b = t(X) @ y; + + if (intercept_status == 2) { + A = t(diagMatrix(scale_X) @ A + shift_X @ A[m_ext - 1, ]); + A = as.matrix(diagMatrix(scale_X) @ A + shift_X @ A[m_ext - 1, ]); + b = as.matrix(diagMatrix(scale_X) @ b + shift_X @ b[m_ext - 1, ]); + } + + A = as.matrix(A + diagMatrix(lambda)); + + if (verbose) { + print("Calling the Direct Solver..."); + } + + beta_unscaled = solve(A, b); + beta = [0.0]; + if (intercept_status == 2) { + beta 
= scale_X * beta_unscaled; + beta[m_ext - 1, ] = beta[m_ext - 1, ] + t(shift_X) @ beta_unscaled; + } else { + beta = as.matrix(beta_unscaled); + } + + B = beta; + return as.matrix(B); +} + diff --git a/scripts/translated_scripts/logSumExp.daph b/scripts/translated_scripts/logSumExp.daph new file mode 100644 index 000000000..5c39e5ce6 --- /dev/null +++ b/scripts/translated_scripts/logSumExp.daph @@ -0,0 +1,29 @@ +def m_logSumExp(M:matrix, margin:str /*= "none"*/) -> matrix { + + if (margin == "rows") { + ds = M - aggMax(M, 0); + rSumOfexp = sum(exp(ds), 0); + output = aggMax(M, 0) + log(rSumOfexp, 2); + } else { + + if (margin == "cols") { + ds = as.matrix(M - aggMax(M, 1)); + cSumOfexp = sum(exp(ds), 1); + output = as.matrix(aggMax(M, 1) + log(cSumOfexp, 2)); + } else { + + if (margin == "none") { + ds = as.matrix(M - aggMax(M)); + sumOfexp = sum(exp(ds)); + output = as.matrix(aggMax(M) + log(sumOfexp, 2)); + } else { + stop("invalid margin value expecting rows, cols or none found: " + margin); + } + + } + + } + + return as.matrix(output); +} + diff --git a/scripts/translated_scripts/mcc.daph b/scripts/translated_scripts/mcc.daph new file mode 100644 index 000000000..463f1f160 --- /dev/null +++ b/scripts/translated_scripts/mcc.daph @@ -0,0 +1,48 @@ +def computeMCC(confusionM:matrix) -> f64 { + TN = as.scalar(as.f64(confusionM[0, 0])); + FP = as.scalar(as.f64(confusionM[0, 1])); + FN = as.scalar(as.f64(confusionM[1, 0])); + TP = as.scalar(as.f64(confusionM[1, 1])); + + if (aggMin(sum(confusionM, 0)) == 0 || aggMin(sum(confusionM, 1)) == 0) { + mattCC = as.f64(0.0); + } else { + mattCC = as.f64((TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))); + } + + return as.f64(mattCC); +} + +def m_mcc(predictions:matrix, labels:matrix) -> f64 { + + if ((as.si64(ncells(labels)) > 0 && sum(labels) == 0)) { + stop("MCC Input Error: labels contains only zeros"); + } + + + if (as.si64(nrow(predictions)) != as.si64(nrow(labels))) { + stop("MCC Input Error: 
rows in predictions != rows in labels"); + } + + + if (aggMin(labels) != 0 || aggMin(predictions) != 0) { + stop("MCC Input Error: accepts 0/1 vector only"); + } + + + if (aggMin(labels) == aggMax(labels)) { + stop("MCC Input Error: labels contains single class"); + } + + + if (aggMax(labels) > 1 || aggMax(predictions) > 1) { + stop("MCC Input Error: accepts 0/1 vector only"); + } + + labels = as.matrix(labels + 1); + predictions = as.matrix(predictions + 1); + confM = ctable(labels, predictions, 2, 2); + mattCC = computeMCC(as.matrix(confM)); + return as.f64(mattCC); +} + diff --git a/scripts/translated_scripts/multiLogReg.daph b/scripts/translated_scripts/multiLogReg.daph new file mode 100644 index 000000000..f21b11a88 --- /dev/null +++ b/scripts/translated_scripts/multiLogReg.daph @@ -0,0 +1,75 @@ + + +def m_multiLogReg (X:matrix, Y:matrix, icpt:si64 /*= 2*/, tol:f64 /*= 0.000001*/, reg:f64 /*= 1.0*/, maxi:si64 /*= 100*/, maxii:si64 /*= 20*/, is_verbose:bool /*= true*/) -> matrix { + eta0 = 0.0001; + eta1 = 0.25; + eta2 = 0.75; + sigma1 = 0.25; + sigma2 = 0.5; + sigma3 = 4.0; + psi = 0.1; + + N = nrow(X); + D = ncol(X); + + if (is_verbose) { + print("BEGIN K-MEANS SCRIPT"); + } + + # Robustness for datasets with missing values (causing NaN gradients) + hasNaNs = sum(isNan(X)) > 0; + if (hasNaNs) { + if(is_verbose) { + print("multiLogReg: matrix X contains ", 0); + print("sum(isNan(X))", 0); + print(" missing values, replacing with 0."); + } + X = replace(X, nan, 0); + # TODO: I am not sure if NaN is recognized as a scalar like this. + } + + # Introduce the intercept, shift and rescale the columns of X if needed + if (icpt == 1 || icpt == 2) { + if (N == nrow(X)) { + N = nrow(X); + # TODO: This is the if statement in the orginal code, does it even do anything at all? 
+ } + X = cbind(X, fill(1.0, N, 1)); + D = ncol(X); + } + + scale_lambda = fill(1.0, D, 1); + if (icpt == 1 || icpt == 2) { + scale_lambda[D - 1, 0] = [0.0]; + } + + if (icpt == 2) { + avg_X_cols = t(sum(X, 1)) / N; + var_X_cols = (t(sum(X ^ 2, 1)) - N * (avg_X_cols ^ 2)) / (N - 1); + is_unsafe = var_X_cols <= 0; + scale_X = 1.0 / sqrt(var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X[D - 1, 0] = [1.0]; + shift_X = -avg_X_cols * scale_X; + shift_X[D - 1, 0] = [0.0]; + rowSums_X_sq = (X ^ 2) @ (scale_X ^ 2) + X @ (2 * scale_X * shift_X) + sum(shift_X ^ 2); + } else { + scale_X = fill(1.0, D, 1); + shift_X = fill(0.0, D, 1); + rowSums_X_sq = sum(X ^ 2); + } + + # Convert "Y" into indicator matrix: + max_y = aggMax(Y); + print(max_y); + + if (aggMin(Y) <= 0) { + # Category labels "0", "-1" etc, are converted into the largest label + Y = Y <= 0 ? max_y + 1 : Y; + max_y = max_y + 1; + } + Y = ctable(seq(1.0, N, 1.0), t(Y), N, max_y); + print(Y); + + return X; +} + diff --git a/scripts/translated_scripts/naiveBayesPredict.daph b/scripts/translated_scripts/naiveBayesPredict.daph new file mode 100644 index 000000000..47bb7ac17 --- /dev/null +++ b/scripts/translated_scripts/naiveBayesPredict.daph @@ -0,0 +1,10 @@ +def m_naiveBayesPredict(X:matrix, P:matrix, C:matrix) -> matrix, matrix { + numRows = as.si64(nrow(X)); + model = cbind(C, P); + ones = fill(as.f64(1), numRows, 1); + X_w_ones = cbind(X, ones); + YRaw = X_w_ones @ t(log(model, 2)); + Y = idxMax(YRaw, 0); + return as.matrix(YRaw), as.matrix(Y); +} + diff --git a/scripts/translated_scripts/pcaInverse.daph b/scripts/translated_scripts/pcaInverse.daph new file mode 100644 index 000000000..f05092557 --- /dev/null +++ b/scripts/translated_scripts/pcaInverse.daph @@ -0,0 +1,15 @@ +def m_pcaInverse(Y:matrix, Clusters:matrix, Centering:matrix /*= fill(as.f64(0), 1, 1)*/, ScaleFactor:matrix /*= fill(as.f64(0), 1, 1)*/) -> matrix { + X = Y @ t(Clusters); + + if (as.si64(nrow(ScaleFactor)) > 0 && as.si64(ncol(ScaleFactor)) 
> 0) { + X = as.matrix(X * ScaleFactor); + } + + + if (as.si64(nrow(Centering)) > 0 && as.si64(ncol(Centering)) > 0) { + X = as.matrix(X + Centering); + } + + return as.matrix(X); +} + diff --git a/scripts/translated_scripts/raSelection.daph b/scripts/translated_scripts/raSelection.daph new file mode 100644 index 000000000..494d16e6a --- /dev/null +++ b/scripts/translated_scripts/raSelection.daph @@ -0,0 +1,6 @@ +def m_raSelection(X:matrix, col:si64, op:str, val:f64) -> matrix { + I = op == "==" ? X[, col - 1] == val : op == "!=" ? X[, col - 1] != val : op == "<" ? X[, col - 1] < val : op == ">" ? X[, col - 1] > val : op == "<=" ? X[, col - 1] <= val : X[, col - 1] >= val; + Y = X[[I, ]]; + return as.matrix(Y); +} + diff --git a/scripts/translated_scripts/rand.daph b/scripts/translated_scripts/rand.daph new file mode 100644 index 000000000..22bd01d6a --- /dev/null +++ b/scripts/translated_scripts/rand.daph @@ -0,0 +1,5 @@ +def m_rand(X:matrix) -> matrix { + Y = rand(10, 20, 0, 1, 0.2000000000000000111, 1); + return as.matrix(Y); +} + diff --git a/scripts/translated_scripts/scaleMinMax.daph b/scripts/translated_scripts/scaleMinMax.daph new file mode 100644 index 000000000..3c5078b78 --- /dev/null +++ b/scripts/translated_scripts/scaleMinMax.daph @@ -0,0 +1,10 @@ +def m_scaleMinMax(X:matrix) -> matrix { + print("Deprecated scaleMinMax use normalize instead"); + cmin = aggMin(X, 1); + cmax = aggMax(X, 1); + diff = (cmax - cmin); + diff = replace(diff, 0, 1); + Y = (X - cmin) / diff; + return as.matrix(Y); +} + diff --git a/scripts/translated_scripts/test.daph b/scripts/translated_scripts/test.daph new file mode 100644 index 000000000..13e2c4b9c --- /dev/null +++ b/scripts/translated_scripts/test.daph @@ -0,0 +1,7 @@ +def m_test(X:matrix) -> matrix { + X = fill(as.f64(0), 10, 10); + a = sum(X == 1) >= 0; + print(a); + return X; +} + diff --git a/scripts/translated_scripts/test_kmeans.daph b/scripts/translated_scripts/test_kmeans.daph new file mode 100644 index 
000000000..69986591a --- /dev/null +++ b/scripts/translated_scripts/test_kmeans.daph @@ -0,0 +1,28 @@ +import "kmeans_final.daph"; + +seed = 126178; +num_points = 100; + +// Define the centers of the clusters +center1 = [1.0, 2.0]; +center2 = [3.0, 4.0]; + +noise1 = rand(num_points, 2, -0.5, 0.5, 1.0, seed); +noise2 = rand(num_points, 2, -0.5, 0.5, 1.0, seed); + +cluster1 = noise1 + t(center1); +cluster2 = noise2 + t(center2); + +data = rbind(cluster1, cluster2); + +k = 3; +runs = 100; +max_iter = 100; +eps = 0.000001; +is_verbose = true; +avg_sample_size_per_centroid = 3; + +X = readMatrix($inPath); + +C, Y = kmeans_final.m_kmeans(as.matrix(X), k, runs, max_iter, eps, is_verbose, avg_sample_size_per_centroid, seed); +print(C); diff --git a/scripts/translated_scripts/test_logSumExp.daph b/scripts/translated_scripts/test_logSumExp.daph new file mode 100644 index 000000000..27c77ca3d --- /dev/null +++ b/scripts/translated_scripts/test_logSumExp.daph @@ -0,0 +1,9 @@ +import "logSumExp.daph"; + +M = [100000, 2, 3, 4, 5, 0.00001]; + +margin = "rows"; + +output = logSumExp.m_logSumExp(as.matrix(M), margin); + +print(output); diff --git a/scripts/translated_scripts/test_multiLogReg.daph b/scripts/translated_scripts/test_multiLogReg.daph new file mode 100644 index 000000000..6b380e814 --- /dev/null +++ b/scripts/translated_scripts/test_multiLogReg.daph @@ -0,0 +1,17 @@ +import "multiLogReg.daph"; + +n_samples = 3; +n_features = 2; +n_classes = 3; +seed = 1234; + +X = rand(n_samples, n_features, 0.001, 1.0, 0.8, seed); +Y = rand(1, n_samples, 0, n_classes, 1.0 / n_classes, seed); +icpt = 2; +tol = 0.000001; +reg = 1.0; +maxi = 100; +maxii = 20; +verbose = 1; + +multiLogReg.m_multiLogReg(X, as.matrix(Y), icpt, tol, reg, maxi, maxii, as.bool(verbose)); diff --git a/scripts/translated_scripts/test_rand.daph b/scripts/translated_scripts/test_rand.daph new file mode 100644 index 000000000..018b7f524 --- /dev/null +++ b/scripts/translated_scripts/test_rand.daph @@ -0,0 +1,6 @@ 
+import "rand.daph"; + +X = [1.0]; + +Y = rand.m_rand(X); +print(Y); diff --git a/scripts/translated_scripts/toOneHot.daph b/scripts/translated_scripts/toOneHot.daph new file mode 100644 index 000000000..2ca7e02ab --- /dev/null +++ b/scripts/translated_scripts/toOneHot.daph @@ -0,0 +1,10 @@ +def m_toOneHot(X:matrix, numClasses:si64) -> matrix { + + if (numClasses < aggMax(X)) { + stop("numClasses must be >= largest value in X to prevent cropping"); + } + + Y = ctable(seq(as.f64(1), as.si64(nrow(X)), 1 <= as.si64(nrow(X)) ? 1 : -1), X, as.si64(nrow(X)), numClasses); + return as.matrix(Y); +} + diff --git a/scripts/translated_scripts/tomeklink.daph b/scripts/translated_scripts/tomeklink.daph new file mode 100644 index 000000000..545b0cdb6 --- /dev/null +++ b/scripts/translated_scripts/tomeklink.daph @@ -0,0 +1,56 @@ +def get_nn(X:matrix) -> matrix { + nn = fill(as.f64(0), as.si64(nrow(X)), 1); + for(i in 1:as.si64(nrow(X))) { + dists = sum((X - X[i - 1, ]) ^ 2, 0); + dists[i - 1, ] = nan; + nn[i - 1, 0] = idxMax(t(dists), 0); + } + return as.matrix(nn); +} + +def get_links(X:matrix, y:matrix, majority_label:f64) -> matrix { + tomek_links = fill(as.f64((-1.0)), 1, 1); + + if (aggMax(y) <= 2) { + nn = get_nn(as.matrix(X)); + perm = ctable(seq(as.f64(1), as.si64(nrow(y)), 1 <= as.si64(nrow(y)) ? 
1 : -1), nn, as.si64(nrow(y)), as.si64(nrow(y))); + nn_labels = perm @ y; + links = (y != majority_label) && (nn_labels == majority_label); + tomek_links = as.matrix((ctable(nn, 1, links, as.si64(nrow(y)), 1) > 0)); + } + + return as.matrix(tomek_links); +} + +def m_tomeklink(X:matrix, y:matrix) -> matrix, matrix, matrix { + ymin = aggMin(y); + + if (ymin == 0) { + y = as.matrix(y + 1); + } + + label = ctable(y, 1); + majority_label = as.scalar(as.f64(idxMax(t(label), 0))); + tomek_links = get_links(as.matrix(X), as.matrix(y), as.f64(majority_label)); + drop_idx = [0.0]; + y_under = [0.0]; + X_under = [0.0]; + if (sum(tomek_links == 0) > 0) { + drop_idx = tomek_links * seq(as.f64(1), as.si64(nrow(X)), 1 <= as.si64(nrow(X)) ? 1 : -1); + X_under = X[[(tomek_links == 0), ]]; + y_under = y[[(tomek_links == 0), ]]; + drop_idx = drop_idx[[tomek_links, ]]; + } else { + X_under = as.matrix(X); + y_under = as.matrix(y); + drop_idx = as.matrix(nan); + } + + + if (ymin == 0) { + y_under = as.matrix(y_under - 1); + } + + return as.matrix(X_under), as.matrix(y_under), as.matrix(drop_idx); +} + diff --git a/scripts/translated_scripts/winsorizeApply.daph b/scripts/translated_scripts/winsorizeApply.daph new file mode 100644 index 000000000..ce9906b78 --- /dev/null +++ b/scripts/translated_scripts/winsorizeApply.daph @@ -0,0 +1,5 @@ +def m_winsorizeApply(X:matrix, qLower:matrix, qUpper:matrix) -> matrix { + Y = min(max(X, qLower), qUpper); + return as.matrix(Y); +} +