forked from se-sic/coronet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
util-networks-misc.R
239 lines (193 loc) · 9.83 KB
/
util-networks-misc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
## This file is part of coronet, which is free software: you
## can redistribute it and/or modify it under the terms of the GNU General
## Public License as published by the Free Software Foundation, version 2.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
##
## Copyright 2016-2017 by Sofie Kemper <[email protected]>
## Copyright 2016-2017 by Claus Hunsen <[email protected]>
## Copyright 2016-2018 by Thomas Bock <[email protected]>
## Copyright 2020 by Thomas Bock <[email protected]>
## Copyright 2017 by Angelika Schmid <[email protected]>
## Copyright 2019 by Jakob Kronawitter <[email protected]>
## Copyright 2019-2020 by Anselm Fehnker <[email protected]>
## All Rights Reserved.
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Libraries ---------------------------------------------------------------
requireNamespace("parallel") # for parallel computation
requireNamespace("igraph") # networks
requireNamespace("Matrix") # for sparse matrices
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Get active authors -----------------------------------------------------
#' Collect the names of all authors that occur as vertices in the given networks.
#'
#' @param networks the list of networks whose vertex names are to be collected
#' @param globally if \code{TRUE}, one duplicate-free and alphabetically sorted vector of names
#'                 over all networks is returned; if \code{FALSE}, one vector of names per
#'                 network is returned [default: TRUE]
#'
#' @return the author names, either as a single vector or as a list with one vector per network
get.author.names.from.networks = function(networks, globally = TRUE) {

    ## collect the vertex names of every single network
    authors.per.network = lapply(networks, function(net) igraph::V(net)$name)

    ## the per-network view is requested: nothing more to do
    if (!globally) {
        return(authors.per.network)
    }

    ## merge the per-network vectors into one vector and get rid of the element names
    ## that 'unlist' derives from the names of the list
    all.authors = unname(unlist(authors.per.network, recursive = FALSE))

    ## drop duplicates and sort alphabetically in ascending order
    all.authors = sort(unique(all.authors))

    return(all.authors)
}
#' Collect the names of all authors that are active in at least one of the given data sources
#' within the given data ranges.
#'
#' @param data.ranges the list of data ranges; for each element, the authors are obtained by calling
#'                    its method \code{get.authors.by.data.source}
#' @param data.sources the data sources from which the author names should be retrieved,
#'                     can be either \code{"commits"}, \code{"mails"}, or \code{"issues"},
#'                     or any combination of them [default: c("commits", "mails", "issues")]
#' @param globally if \code{TRUE}, one duplicate-free and alphabetically sorted vector of names
#'                 over all data ranges is returned; if \code{FALSE}, one vector of names per
#'                 data range is returned [default: TRUE]
#'
#' @return the author names, either as a single vector or as a list with one vector per data range
get.author.names.from.data = function(data.ranges, data.sources = c("commits", "mails", "issues"), globally = TRUE) {

    data.sources = match.arg.or.default(data.sources, several.ok = TRUE)

    ## per range, look up the authors of the selected data sources and keep only their names
    authors.per.range = lapply(data.ranges, function(range.data) {
        range.authors = range.data$get.authors.by.data.source(data.sources)
        return(range.authors[["author.name"]])
    })

    ## the per-range view is requested: nothing more to do
    if (!globally) {
        return(authors.per.range)
    }

    ## merge the per-range vectors into one vector and get rid of the element names
    ## that 'unlist' derives from the names of the list
    all.authors = unname(unlist(authors.per.range, recursive = FALSE))

    ## drop duplicates and sort alphabetically in ascending order
    all.authors = sort(unique(all.authors))

    return(all.authors)
}
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Adjacency matrices ----------------------------------------------------
#' Get a sparse expanded adjacency matrix for a network.
#'
#' The adjacency matrix is expanded as it may contain rows and columns for authors which are not part of the network
#' but given in the \code{authors} parameter. However, this also means that authors present in the network
#' but not given in the \code{authors} parameter are not contained in the expanded adjacency matrix.
#'
#' @param network the given network
#' @param authors all authors that are wanted in the adjacency matrix; they are used as row and column
#'                names of the result, in the given order
#' @param weighted decides if the adjacency matrix shall be weighted, i.e., filled from the edge
#'                 attribute \code{"weight"} [default: FALSE]
#'
#' @return the sparse adjacency matrix of the network (class \code{dgTMatrix}) with one row and one
#'         column per entry of \code{authors}
get.expanded.adjacency = function(network, authors, weighted = FALSE) {

    ## create an empty sparse matrix with the right size
    matrix = Matrix::sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE)
    matrix = as(matrix, "dgTMatrix")

    ## add row and column names
    rownames(matrix) = authors
    colnames(matrix) = authors

    if (igraph::vcount(network) > 0) {

        if (weighted) {
            ## get the weighted adjacency matrix for the current network
            matrix.data = igraph::get.adjacency(network, attr = "weight")
        } else {
            ## get the unweighted adjacency matrix for the current network
            matrix.data = igraph::get.adjacency(network)
        }

        ## restrict the adjacency matrix to the wanted authors and order it by author name;
        ## vertices that are not listed in 'authors' have no row/column in the expanded matrix,
        ## so copying them would fail with an out-of-bounds subscript
        network.authors = rownames(matrix.data)
        wanted = which(network.authors %in% authors)
        wanted = wanted[order(network.authors[wanted])]
        ## 'drop = FALSE' keeps the matrix structure (and the dimnames) also for a single author
        matrix.data = matrix.data[wanted, wanted, drop = FALSE]

        ## save the activity data per author
        if (nrow(matrix.data) > 0) {
            matrix[rownames(matrix.data), colnames(matrix.data)] = matrix.data
        }

        ## in the unweighted case, cap all positive entries at 1
        if (!weighted) {
            matrix[matrix > 0] = 1
        }
    }

    return(matrix)
}
#' Compute one sparse adjacency matrix per network in the given list.
#' All adjacency matrices are expanded in such a way that they use the same set
#' of authors, which is derived from all networks in the list.
#'
#' @param networks list of networks
#' @param weighted decides if the adjacency matrices shall be weighted [default: FALSE]
#'
#' @return the list of adjacency matrices, one per network
get.expanded.adjacency.matrices = function(networks, weighted = FALSE){

    ## the common set of authors over all networks
    all.authors = get.author.names.from.networks(networks)

    ## expand each network to this common set of authors
    adjacency.matrices = parallel::mclapply(networks, function(network) {
        return(get.expanded.adjacency(network, all.authors, weighted))
    })

    return(adjacency.matrices)
}
#' Gets a list of networks, converts them to sparse adjacency matrices, and sums up the adjacency matrices cumulatively.
#' This means that the first entry of the returned list is just the adjacency matrix from the first network,
#' the second entry is the sum of the first and the second entry, and so on.
#'
#' In the unweighted case, the entries of each cumulated matrix are reset to 1 after summing.
#'
#' @param networks list of networks; for an empty list, an empty list is returned
#' @param weighted decides if the adjacency matrix shall be weighted [default: FALSE]
#'
#' @return the list of cumulated adjacency matrices
get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) {

    ## nothing to cumulate for an empty list of networks
    ## (accessing the first matrix below would fail otherwise)
    if (length(networks) == 0) {
        return(list())
    }

    ## get expanded adjacency matrices first
    matrices = get.expanded.adjacency.matrices(networks, weighted)

    ## pair-wise sum of matrices: m.cumul(m) = m.cumul(m - 1) + m
    ## (intermediate results consecutively stored in matrices.cumulated)
    matrices.cumulated = list(matrices[[1]]) # first one is complete already

    ## 'seq_along(...)[-1]' is empty if there is only one matrix
    for (m in seq_along(matrices)[-1]) {

        cumulated = matrices.cumulated[[m - 1]] + matrices[[m]]
        rownames(cumulated) = rownames(matrices.cumulated[[m - 1]])
        colnames(cumulated) = colnames(matrices.cumulated[[m - 1]])

        if (!weighted) {

            ## search for an entry >= 1 and set it to an arbitrary number (e.g., 42);
            ## afterwards, the complete value slot '@x' is replaced by ones (one per entry of slot '@i')
            ## NOTE(review): presumably the single-element assignment forces the sum into a
            ## representation in which '@x' and '@i' line up -- confirm before simplifying this
            not.zero.idxs = which(cumulated >= 1, arr.ind = TRUE)
            if (nrow(not.zero.idxs) > 0) {
                first.not.zero.idx = not.zero.idxs[1, ]
                names(first.not.zero.idx) = c("row", "col")

                cumulated[first.not.zero.idx[["row"]], first.not.zero.idx[["col"]]] = 42
                cumulated@x = rep(1, length(cumulated@i))
            }
        }

        matrices.cumulated[[m]] = cumulated
    }

    return(matrices.cumulated)
}
#' Converts a list of adjacency matrices to a 3-dimensional array.
#'
#' The size and the row and column names of the array slices are taken from the first matrix
#' in the list; slice \code{i} of the array holds the non-zero entries of the \code{i}-th matrix,
#' all other cells are 0. Matrices without any non-zero entry yield an all-zero slice.
#'
#' @param adjacency.list the list of adjacency matrices (expected to be non-empty and to contain
#'                       square matrices of identical size)
#'
#' @return the converted array
convert.adjacency.matrix.list.to.array = function(adjacency.list){

    ## create a 3-dimensional array representing the adjacency matrices (SIENA data format) as result
    first.adjacency = adjacency.list[[1]]
    result = array(data = 0, dim = c(nrow(first.adjacency), nrow(first.adjacency), length(adjacency.list)))
    rownames(result) = rownames(first.adjacency)
    colnames(result) = colnames(first.adjacency)

    ## copy the activity values from the adjacency matrices in the list to the corresponding array slices
    for (i in seq_along(adjacency.list)){
        adjacency = adjacency.list[[i]]
        activity.indices = which(adjacency != 0, arr.ind = TRUE)

        ## a matrix without non-zero entries has nothing to copy; iterating over
        ## '1:nrow(activity.indices)' would run over c(1, 0) here and fail
        if (nrow(activity.indices) > 0) {
            ## copy all non-zero cells at once: (row, col) in the matrix -> (row, col, i) in the array
            result[cbind(activity.indices, i)] = adjacency[activity.indices]
        }
    }

    return(result)
}