-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdsample.c
117 lines (96 loc) · 2.96 KB
/
dsample.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include "cfile.h"
#include "snames.h"
static int usage() {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: yame dsample [options] <in.cx> <out.cx>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -s [N] seed for sampling.\n");
fprintf(stderr, " -N [N] number of records to sample. (default: 100).\n");
fprintf(stderr, " When higher than available, capped to available.\n");
fprintf(stderr, " -h This help\n");
fprintf(stderr, "\n");
return 1;
}
void Fisher_Yates_shuffle(uint64_t *array, uint64_t n) {
for (uint64_t i = 0; i<n-1; ++i) {
uint64_t j = i + rand() / (RAND_MAX / (n-i) + 1);
uint64_t t = array[j];
array[j] = array[i];
array[i] = t;
}
}
cdata_t* downsample(cdata_t *c0, cdata_t *c, uint64_t N) {
if (c0->fmt != '3') {
fprintf(stderr, "[%s:%d] Input must be of format 3.\n", __func__, __LINE__);
fflush(stderr);
exit(1);
}
decompress(c0, c);
uint64_t N_indices = 0;
for (uint64_t i=0; i<c->n; ++i)
if (f3_get_mu(c, i)) N_indices++;
if (N >= N_indices) return c; // more elements to sample than exist
uint64_t *indices = calloc(N_indices, sizeof(uint64_t));
uint64_t j = 0;
for (uint64_t i = 0; i<c->n; ++i)
if (f3_get_mu(c, i)) indices[j++] = i;
Fisher_Yates_shuffle(indices, N_indices);
uint8_t *to_include = calloc(c->n/8+1, 1);
for (uint64_t j = 0; j<N; ++j)
to_include[indices[j]>>3] |= (1<<(indices[j]&0x7));
for (uint64_t i = 0; i<c->n; ++i) {
if (f3_get_mu(c, i) && !(to_include[i>>3]&(1<<(i&0x7)))) {
f3_set_mu(c, i, 0, 0);
}
}
free(indices); free(to_include);
return c;
}
int main_dsample(int argc, char *argv[]) {
int c;
unsigned seed = (unsigned int)time(NULL);
uint64_t N = 100;
while ((c = getopt(argc, argv, "s:N:h"))>=0) {
switch (c) {
case 's': seed = strtoul(optarg, NULL, 10); break;
case 'N': N = strtoul(optarg, NULL, 10); break;
case 'h': return usage(); break;
default: usage(); wzfatal("Unrecognized option: %c.\n", c);
}
}
if (optind + 1 > argc) {
usage();
wzfatal("Please supply input file.\n");
}
char *fname = argv[optind];
char *fname_out = NULL;
if (argc >= optind + 2)
fname_out = strdup(argv[optind+1]);
BGZF* fp;
if (fname_out) fp = bgzf_open2(fname_out, "wb");
else fp = bgzf_dopen(fileno(stdout), "wb");
if (fp == NULL) {
fprintf(stderr, "[%s:%d] Error opening file for writing: %s\n", __func__, __LINE__, fname_out);
fflush(stderr);
exit(1);
}
cfile_t cf = open_cfile(fname);
srand(seed);
for (uint64_t kq=0;;++kq) {
cdata_t c = read_cdata1(&cf);
if (c.n == 0) break;
cdata_t c2 = {0};
downsample(&c, &c2, N);
if (!c2.compressed) cdata_compress(&c2);
cdata_write1(fp, &c2);
free_cdata(&c); free_cdata(&c2);
}
bgzf_close(cf.fh);
bgzf_close(fp);
if (fname_out) free(fname_out);
return 0;
}