-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredict_ie_fn.R
86 lines (74 loc) · 2.86 KB
/
predict_ie_fn.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Predicts ecological integrity based on input data and trained Bayesian network
# This code is meant to be run in a cluster
library('gRain')
#' Function to predict ecological integrity
#' @param prior RData trained Bayesian network
#' @param input_csv Path to cases csv file to be evaluated
#' @param total_rows Number of total cases
#' @param no_of_partitions Number of partitions to split cases
#' @param i_cluster i-th partition of the dataframe
#' @param out_path Output folder path
predict_ie_bn <- function(
prior,
input_csv,
total_rows,
no_of_partitions=200,
i_cluster,
out_path
) {
if (i_cluster <= 0 || i_cluster > no_of_partitions) {
stop("Cluster number out of range")
}
col_names <- colnames(read.csv(input_csv, nrows = 1))
# split data
size <- trunc(total_rows/no_of_partitions)
n_ini <- (i_cluster-1)*size + 1
cat(format(Sys.time(), "%Y/%m/%e %H:%M:%S")," - Leyendo datos del cluster ", i_cluster, "\n")
if ( i_cluster == no_of_partitions ) {
df <- read.csv(input_csv,
skip = n_ini,
header = FALSE,
col.names = col_names)
} else {
df <- read.csv(input_csv,
skip = n_ini,
nrows = size,
header = FALSE,
col.names = col_names)
}
gc(reset = TRUE)
# Prediction
cat(format(Sys.time(), "%Y/%m/%e %H:%M:%S")," - Empieza prediccion del cluster ", i_cluster, "\n")
prediction <- predict(prior,
response="hemerobia",
newdata=df,
type="distribution")
probabilities <- prediction$pred$hemerobia
# Raster with standardized expectancy
expectancy <- probabilities %*% as.numeric(colnames(probabilities))
expectancy <- (18-expectancy)/(18)
df_exp <- data.frame(x=df$x,y=df$y,ie=expectancy)
# Raster with most probable category
category <- colnames(probabilities)[apply(probabilities,1,which.max)]
df_cat <- data.frame(x=df$x,y=df$y,ie=category)
# Save rasters
cat(format(Sys.time(), "%Y/%m/%e %H:%M:%S"), " - Escribiendo resultados del cluster ", i_cluster, "\n")
write.csv(df_exp,file.path(out_path,'df_expectancy',paste0('df_exp_',i_cluster,'.csv')),
row.names = FALSE)
write.csv(df_cat,file.path(out_path,'df_categorical',paste0('df_cat_',i_cluster,'.csv')),
row.names = FALSE)
}
args <- commandArgs(TRUE)
input_csv <- 'df_input.csv'
total_rows <- as.numeric(system(paste("cat", input_csv, "| wc -l"), intern = TRUE)) - 1
n_parts <- 4000
prior <- readRDS('prior.RData')
i_cluster <- as.numeric(args[1])
out_path <- 'output'
cat(format(Sys.time(), "%Y/%m/%e %H:%M:%S"), " - Procesando particion ", i_cluster, " de ", n_parts, "\n")
predict_ie_bn(prior,
input_csv,
total_rows,
n_parts,
i_cluster,
out_path)