-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexample.cu
83 lines (68 loc) · 2.31 KB
/
example.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#include <iostream>
#include <kiss/kiss.cuh>
#include <helpers/cuda_helpers.cuh>
#include <helpers/timers.cuh>
// Launches a kernel that writes n generated values of type T to out.
//
// Template parameters:
//   T    element type stored in out; obtained via rng.template next<T>()
//   Rng  generator type, brace-constructed from a 32-bit seed
//
// Parameters:
//   out   destination pointer; it is dereferenced inside the kernel
//   n     number of elements to write
//   seed  base seed; each thread hashes (seed + its global thread id) to
//         obtain the seed of its own generator
//
// The launch configuration is fixed at 4096 blocks of 32 threads, and CUERR
// is invoked directly after the launch.
template<class T, class Rng>
HOSTQUALIFIER INLINEQUALIFIER
void uniform_distribution(
    T * const out,
    const std::uint64_t n,
    const std::uint32_t seed) noexcept
{
    helpers::lambda_kernel<<<4096, 32>>>(
        [=] DEVICEQUALIFIER
        {
            // flat index of this thread across the whole launch
            const std::uint32_t global_id =
                blockDim.x * blockIdx.x + threadIdx.x;

            // number of threads in the launch; each thread advances by this
            const auto step = blockDim.x * gridDim.x;

            // derive a thread-private seed and build the generator from it
            const std::uint32_t thread_seed =
                kiss::hashers::MurmurHash<std::uint32_t>::hash(seed + global_id);
            Rng generator{thread_seed};

            // thread t handles elements t, t + step, t + 2*step, ... below n
            std::uint64_t idx = global_id;
            while(idx < n)
            {
                out[idx] = generator.template next<T>();
                idx += step;
            }
        });
    CUERR
}
// Example program: fills a GPU buffer with 2^28 uniformly drawn 64-bit values
// (2 GiB), reports the throughput of the memset and of the generation step,
// copies the result to the host and prints the first ten values.
int main ()
{
    // element type to generate and the generator type producing it
    using data_t = std::uint64_t;
    using rng_t = kiss::Kiss<data_t>;

    // problem size and base seed
    static constexpr std::uint64_t n = 1UL << 28;
    static constexpr std::uint32_t seed = 42;

    // size of one full buffer in bytes
    const auto num_bytes = sizeof(data_t)*n;

    // host-side buffer receiving the final result
    data_t * host_values = nullptr;
    cudaMallocHost(&host_values, num_bytes); CUERR

    // device-side buffer the kernel writes into
    data_t * device_values = nullptr;
    cudaMalloc(&device_values, num_bytes); CUERR

    // zero the device buffer up front (a precaution) and report its throughput
    // NOTE(review): GpuTimer presumably starts timing on construction - confirm
    helpers::GpuTimer zero_timer("memset");
    cudaMemset(device_values, 0, num_bytes); CUERR
    zero_timer.print_throughput(sizeof(data_t), n);

    // draw the random values on the GPU and report the throughput
    helpers::GpuTimer draw_timer("generate random");
    uniform_distribution<data_t, rng_t>(device_values, n, seed); CUERR
    draw_timer.print_throughput(sizeof(data_t), n);

    // bring the generated values back to the host
    cudaMemcpy(host_values, device_values, num_bytes, D2H); CUERR

    // show the first ten generated values, one per line
    for(const data_t * it = host_values; it != host_values + 10; ++it)
    {
        std::cout << *it << std::endl;
    }

    // release both buffers
    cudaFreeHost(host_values); CUERR
    cudaFree(device_values); CUERR
}