#!/usr/bin/env python
# Density_Sampling/Density_Sampling.py
# Author: Gregory Giecold for the GC Yuan Lab
# Affiliation: Harvard University
# Contact: [email protected], [email protected]
"""For a data-set comprising a mixture of rare and common populations,
density sampling gives equal weights to selected representatives
of those distinct populations.
Density sampling is a balancing act between signal and noise: while it increases
the prevalence of rare populations, it also increases the prevalence of noisy
sample points whose local densities happen to exceed the outlier density computed
by Density_Sampling.
An illustration of how to use the present module is in order:
>>> iris = datasets.load_iris()
>>> Y = iris.target
>>> X_reduced = PCA(n_components = 3).fit_transform(iris.data)
>>> plot_PCA(X_reduced, Y, 'the whole Iris data-set')
>>> sampled_indices = density_sampling(X_reduced, metric = 'euclidean', desired_samples = 50)
>>> downsampled_X_reduced = X_reduced[sampled_indices, :]
>>> downsampled_Y = Y[sampled_indices]
>>> plot_PCA(downsampled_X_reduced, downsampled_Y, 'the Iris data-set\\ndown-sampled to about 50 samples')
Reference
---------
Giecold, G., Marco, E., Trippa, L. and Yuan, G.-C.,
"Robust Lineage Reconstruction from High-Dimensional Single-Cell Data".
ArXiv preprint [q-bio.QM, stat.AP, stat.CO, stat.ML]: http://arxiv.org/abs/1601.02748
"""
import numbers
import numpy as np
import operator
import psutil
from functools import reduce
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.neighbors import radius_neighbors_graph
from sys import exit
from tempfile import NamedTemporaryFile
__all__ = ['get_local_densities', 'density_sampling']
def memory():
"""Determine memory specifications of the machine.
Returns
-------
    mem_info : dictionary
Holds the current values for the total, free and used memory of the system.
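
    Examples
    --------
    A minimal sanity check; psutil reports these fields on all supported platforms:

    >>> info = memory()
    >>> all(key in info for key in ('total', 'free', 'used'))
    True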
"""
mem_info = dict()
    for k, v in psutil.virtual_memory()._asdict().items():
mem_info[k] = int(v)
return mem_info
def get_chunk_size(N, n):
"""Given a two-dimensional array with a dimension of size 'N',
determine the number of rows or columns that can fit into memory.
Parameters
----------
N : int
The size of one of the dimensions of a two-dimensional array.
n : int
The number of arrays of size 'N' times 'chunk_size' that can fit in memory.
Returns
-------
chunk_size : int
The size of the dimension orthogonal to the one of size 'N'.
"""
    mem_free = memory()['free']
    # Scale the chunk size to the amount of free memory, assuming 4-byte entries,
    # while keeping a safety margin of memory untouched.
    if mem_free > 60000000:
chunk_size = int(((mem_free - 10000000) * 1000) / (4 * n * N))
return chunk_size
elif mem_free > 40000000:
chunk_size = int(((mem_free - 7000000) * 1000) / (4 * n * N))
return chunk_size
elif mem_free > 14000000:
chunk_size = int(((mem_free - 2000000) * 1000) / (4 * n * N))
return chunk_size
elif mem_free > 8000000:
chunk_size = int(((mem_free - 1400000) * 1000) / (4 * n * N))
return chunk_size
elif mem_free > 2000000:
chunk_size = int(((mem_free - 900000) * 1000) / (4 * n * N))
return chunk_size
elif mem_free > 1000000:
chunk_size = int(((mem_free - 400000) * 1000) / (4 * n * N))
return chunk_size
else:
print("\nERROR: Density_Sampling: get_chunk_size: this machine does not "
"have enough free memory.\n")
exit(1)
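
# A worked example of the heuristic above (hypothetical figures): with
# mem_free = 1200000 units reported free, a dimension of size N = 100000 and
# n = 2 arrays to hold, the 'mem_free > 1000000' branch gives
#     chunk_size = int(((1200000 - 400000) * 1000) / (4 * 2 * 100000)) = 1000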
def median_min_distance(data, metric):
"""This function computes a graph of nearest-neighbors for each sample point in
'data' and returns the median of the distribution of distances between those
nearest-neighbors, the distance metric being specified by 'metric'.
Parameters
----------
data : array of shape (n_samples, n_features)
The data-set, a fraction of whose sample points will be extracted
by density sampling.
metric : string
The distance metric used to determine the nearest-neighbor to each data-point.
The DistanceMetric class defined in scikit-learn's library lists all available
metrics.
Returns
-------
median_min_dist : float
The median of the distribution of distances between nearest-neighbors.
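
    Examples
    --------
    For the four one-dimensional points below, each point's nearest neighbor lies
    exactly 1.0 away, so the median of nearest-neighbor distances is 1.0:

    >>> median_min_distance(np.array([[0.0], [1.0], [5.0], [6.0]]), 'euclidean')
    1.0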
"""
data = np.atleast_2d(data)
nearest_distances = kneighbors_graph(data, 1, mode = 'distance', metric = metric, include_self = False).data
median_min_dist = np.median(nearest_distances, overwrite_input = True)
return round(median_min_dist, 4)
def get_local_densities(data, kernel_mult = 2.0, metric = 'manhattan'):
"""For each sample point of the data-set 'data', estimate a local density in feature
space by counting the number of neighboring data-points within a particular
region centered around that sample point.
Parameters
----------
data : array of shape (n_samples, n_features)
The data-set, a fraction of whose sample points will be extracted
by density sampling.
kernel_mult : float, optional (default = 2.0)
        The kernel multiplier, which determines (in terms of the median of the
        distribution of distances among nearest neighbors) the extent of the region
        centered around each sample point that is considered when computing the
        local density associated with that particular sample point.
metric : string, optional (default = 'manhattan')
The distance metric used to determine the nearest-neighbor to each data-point.
The DistanceMetric class defined in scikit-learn's library lists all available
metrics.
Returns
-------
local_densities : array of shape (n_samples,)
The i-th entry of this vector corresponds to the local density of the i-th sample
point in the order of the rows of 'data'.
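
    Examples
    --------
    With the default kernel_mult of 2.0, the kernel width for the toy data-set below
    is 2.0 (twice the median nearest-neighbor distance of 1.0), so each point counts
    itself and the other member of its pair:

    >>> get_local_densities(np.array([[0.0], [1.0], [5.0], [6.0]]), metric = 'euclidean')
    array([2, 2, 2, 2])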
"""
data = np.atleast_2d(data)
assert isinstance(kernel_mult, numbers.Real) and kernel_mult > 0
kernel_width = kernel_mult * median_min_distance(data, metric)
N_samples = data.shape[0]
if 8.0 * get_chunk_size(N_samples, 1) > N_samples:
A = radius_neighbors_graph(data, kernel_width, mode = 'connectivity', metric = metric, include_self = True)
rows, _ = A.nonzero()
        with NamedTemporaryFile('w+b', delete = True, dir = './') as file_name:
fp = np.memmap(file_name, dtype = int, mode = 'w+', shape = rows.shape)
fp[:] = rows[:]
_, counts = np.unique(fp, return_counts = True)
local_densities = np.zeros(N_samples, dtype = int)
        for i in range(N_samples):
local_densities[i] = counts[i]
else:
local_densities = np.zeros(N_samples, dtype = int)
chunks_size = get_chunk_size(N_samples, 2)
        for i in range(0, N_samples, chunks_size):
chunk = data[i:min(i + chunks_size, N_samples)]
D = pairwise_distances(chunk, data, metric, n_jobs = 1)
D = (D <= kernel_width)
local_densities[i + np.arange(min(chunks_size, N_samples - i))] = D.sum(axis = 1)
return local_densities
def density_sampling(data, local_densities = None, metric = 'manhattan',
kernel_mult = 2.0, outlier_percentile = 0.01,
target_percentile = 0.05, desired_samples = None):
"""The i-th sample point of the data-set 'data' is selected by density sampling
with a probability given by:
                                      | 0                         if outlier_density > LD[i];
        P(keep the i-th data-point) = | 1                         if outlier_density <= LD[i] <= target_density;
                                      | target_density / LD[i]    if LD[i] > target_density.
Here 'LD[i]' denotes the local density of the i-th sample point of the data-set,
whereas 'outlier_density' and 'target_density' are computed as particular percentiles
of that distribution of local densities.
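
    For instance, with outlier_density = 2 and target_density = 10, a sample point
    of local density 1 is discarded, one of local density 5 is kept for sure, and
    one of local density 25 is kept with probability 10 / 25 = 0.4.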
Parameters
----------
data : array of shape (n_samples, n_features)
The data-set, a fraction of whose sample points will be extracted
by density sampling.
local_densities : array of shape (n_samples,), optional (default = None)
The i-th entry of this vector corresponds to the local density of the i-th sample
point in the order of the rows of 'data'.
metric : string, optional (default = 'manhattan')
The distance metric used to determine the nearest-neighbor to each data-point.
The DistanceMetric class defined in scikit-learn's library lists all available
metrics.
kernel_mult : float, optional (default = 2.0)
        The kernel multiplier, which determines (in terms of the median of the
        distribution of distances among nearest neighbors) the extent of the region
        centered around each sample point that is considered when computing the
        local density associated with that particular sample point.
outlier_percentile : float, optional (default = 0.01)
Specify the outlier density as a percentile of the distribution of local densities.
target_percentile : float, optional (default = 0.05)
        Specify the target density as a percentile of the distribution of local densities.
Relevant only if 'desired_samples' is left unspecified.
desired_samples : int, optional (default = None)
        The number of samples to be selected from the whole data-set, such that members
        of rare populations and members of more common populations end up roughly
        equally represented. To that purpose, a target density is computed so as to
        select about 'desired_samples' data-points.
Returns
-------
samples_kept : array of shape (n_selected_samples,)
If the 'i'-th sample point of 'data' has been selected by a given instance of
density sampling, number 'i' is featured in the array returned by
the present function.
"""
random_state = np.random.RandomState()
data = np.atleast_2d(data)
for x in (kernel_mult, outlier_percentile, target_percentile):
assert isinstance(x, numbers.Real) and x > 0
for x in (outlier_percentile, target_percentile):
assert x <= 1.0
if local_densities is None:
local_densities = get_local_densities(data, kernel_mult, metric)
if reduce(operator.mul, local_densities.shape, 1) != max(local_densities.shape):
raise ValueError("\nERROR: Density_Sampling: density_sampling: problem with "
"the dimensions of the vector of local densities provided.\n")
else:
local_densities = np.reshape(local_densities, local_densities.size)
    # 'outlier_percentile' and 'target_percentile' are fractions of unity, whereas
    # np.percentile expects percentages in [0, 100]; hence the factor of 100.
    outlier_density = np.percentile(local_densities, 100 * outlier_percentile)
    target_density = np.percentile(local_densities, 100 * target_percentile)
samples_kept = np.where(local_densities > outlier_density)[0]
N_kept = samples_kept.size
local_densities = local_densities[samples_kept]
if desired_samples is None:
probs = np.divide(target_density + 0.0, local_densities)
ind = np.where(probs > random_state.uniform(size = N_kept))[0]
samples_kept = samples_kept[ind]
elif desired_samples <= N_kept:
sorted_densities = np.sort(local_densities)
temp = np.reciprocal(sorted_densities[::-1].astype(float))
cdf = np.cumsum(temp)[::-1]
target_density = (desired_samples + 0.0) / cdf[0]
if target_density > sorted_densities[0]:
temp = desired_samples - np.arange(1.0, N_kept + 1.0)
possible_targets = np.divide(temp, cdf)
ind = np.argmax(possible_targets < sorted_densities)
target_density = possible_targets[ind]
probs = np.divide(target_density + 0.0, local_densities)
ind = np.where(probs > random_state.uniform(size = N_kept))[0]
samples_kept = samples_kept[ind]
else:
print("\nERROR: Density_Sampling: density_sampling: 'desired_samples' has been "
"assigned a value of {desired_samples}, larger than {N_kept}, "
"the number of samples whose local densities are high enough "
"(i.e. excluded are the local densities in the lowest {outlier_percentile} "
"percentile).\n".format(**locals()))
exit(1)
return samples_kept
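
# A short usage sketch (not executed on import): the vector of local densities can
# be computed once and reused across several calls with different sampling targets,
# where 'X' stands for any array of shape (n_samples, n_features):
#
#     densities = get_local_densities(X, kernel_mult = 2.0, metric = 'euclidean')
#     idx_200 = density_sampling(X, local_densities = densities, desired_samples = 200)
#     idx_500 = density_sampling(X, local_densities = densities, desired_samples = 500)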
if __name__ == '__main__':
import doctest
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
from time import sleep
def plot_PCA(X_reduced, Y, title):
fig = plt.figure(1, figsize = (10, 8))
        ax = fig.add_subplot(projection = '3d')
        ax.view_init(elev = -150, azim = 110)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c = Y,
cmap = plt.cm.Paired)
        ax.set_title('First three PCA directions for {title}'.format(**locals()))
ax.set_xlabel('1st eigenvector')
        ax.xaxis.set_ticklabels([])
        ax.set_ylabel('2nd eigenvector')
        ax.yaxis.set_ticklabels([])
        ax.set_zlabel('3rd eigenvector')
        ax.zaxis.set_ticklabels([])
plt.show(block = False)
sleep(3)
plt.close()
doctest.testmod()