// envmap.cpp (forked from BachiLi/redner)

#include "envmap.h"
#include "scene.h"
#include "parallel.h"

struct envmap_accumulator {
    DEVICE
    inline void operator()(int idx) {
        const auto &d_tex = d_envmap_vals[idx];
        auto xi = d_tex.xi;
        auto yi = d_tex.yi;
        auto texels = d_envmap->values.texels;
        if (xi < 0) {
            // A negative xi indicates a constant (single-texel) texture:
            // accumulate directly into the one texel.
            texels[0] += d_tex.t000[0];
            texels[1] += d_tex.t000[1];
            texels[2] += d_tex.t000[2];
        } else {
            auto w = d_envmap->values.width;
            auto h = d_envmap->values.height;
            auto num_levels = d_envmap->values.num_levels;
            auto xi0 = xi;
            auto xi1 = modulo(xi + 1, w);
            auto yi0 = yi;
            auto yi1 = modulo(yi + 1, h);
            // li == -1 means no mipmap level was sampled; fall back to level 0.
            auto level = d_tex.li;
            if (d_tex.li == -1) {
                level = 0;
            }
            auto lower_texels = texels + level * w * h * 3;
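            // Texel layout implied by the indexing below: each mip level
            // occupies w * h * 3 floats, stored row-major with interleaved
            // RGB, so channel c of texel (x, y) at level l lives at
            // texels[l * w * h * 3 + 3 * (y * w + x) + c].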
#ifdef __CUDA_ARCH__
            // Different DTextures may overlap, so we need atomic updates.
            // The probability of a collision should be small in the SIMD
            // regime, though.
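            // atomic_add is redner's helper for atomically accumulating into
            // a scalar. A minimal sketch of what the device-side overload may
            // look like, assuming it wraps CUDA's atomicAdd (illustration
            // only; the real helper is defined elsewhere in redner):
            //
            //   DEVICE inline float atomic_add(float &target, float value) {
            //       return atomicAdd(&target, value);
            //   }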
            atomic_add(lower_texels[3 * (yi0 * w + xi0) + 0], d_tex.t000[0]);
            atomic_add(lower_texels[3 * (yi0 * w + xi0) + 1], d_tex.t000[1]);
            atomic_add(lower_texels[3 * (yi0 * w + xi0) + 2], d_tex.t000[2]);
            atomic_add(lower_texels[3 * (yi0 * w + xi1) + 0], d_tex.t100[0]);
            atomic_add(lower_texels[3 * (yi0 * w + xi1) + 1], d_tex.t100[1]);
            atomic_add(lower_texels[3 * (yi0 * w + xi1) + 2], d_tex.t100[2]);
            atomic_add(lower_texels[3 * (yi1 * w + xi0) + 0], d_tex.t010[0]);
            atomic_add(lower_texels[3 * (yi1 * w + xi0) + 1], d_tex.t010[1]);
            atomic_add(lower_texels[3 * (yi1 * w + xi0) + 2], d_tex.t010[2]);
            atomic_add(lower_texels[3 * (yi1 * w + xi1) + 0], d_tex.t110[0]);
            atomic_add(lower_texels[3 * (yi1 * w + xi1) + 1], d_tex.t110[1]);
            atomic_add(lower_texels[3 * (yi1 * w + xi1) + 2], d_tex.t110[2]);
            // Only accumulate into the next mip level if one exists
            // (level + 1 < num_levels).
            if (d_tex.li >= 0 && d_tex.li < num_levels - 1) {
                auto higher_texels = texels + (level + 1) * w * h * 3;
                atomic_add(higher_texels[3 * (yi0 * w + xi0) + 0], d_tex.t001[0]);
                atomic_add(higher_texels[3 * (yi0 * w + xi0) + 1], d_tex.t001[1]);
                atomic_add(higher_texels[3 * (yi0 * w + xi0) + 2], d_tex.t001[2]);
                atomic_add(higher_texels[3 * (yi0 * w + xi1) + 0], d_tex.t101[0]);
                atomic_add(higher_texels[3 * (yi0 * w + xi1) + 1], d_tex.t101[1]);
                atomic_add(higher_texels[3 * (yi0 * w + xi1) + 2], d_tex.t101[2]);
                atomic_add(higher_texels[3 * (yi1 * w + xi0) + 0], d_tex.t011[0]);
                atomic_add(higher_texels[3 * (yi1 * w + xi0) + 1], d_tex.t011[1]);
                atomic_add(higher_texels[3 * (yi1 * w + xi0) + 2], d_tex.t011[2]);
                atomic_add(higher_texels[3 * (yi1 * w + xi1) + 0], d_tex.t111[0]);
                atomic_add(higher_texels[3 * (yi1 * w + xi1) + 1], d_tex.t111[1]);
                atomic_add(higher_texels[3 * (yi1 * w + xi1) + 2], d_tex.t111[2]);
            }
#else
            // Lock the environment map. Slow, but probably not a bottleneck.
            std::unique_lock<std::mutex> guard(((std::mutex*)envmap_mutex)[0]);
            lower_texels[3 * (yi0 * w + xi0) + 0] += d_tex.t000[0];
            lower_texels[3 * (yi0 * w + xi0) + 1] += d_tex.t000[1];
            lower_texels[3 * (yi0 * w + xi0) + 2] += d_tex.t000[2];
            lower_texels[3 * (yi0 * w + xi1) + 0] += d_tex.t100[0];
            lower_texels[3 * (yi0 * w + xi1) + 1] += d_tex.t100[1];
            lower_texels[3 * (yi0 * w + xi1) + 2] += d_tex.t100[2];
            lower_texels[3 * (yi1 * w + xi0) + 0] += d_tex.t010[0];
            lower_texels[3 * (yi1 * w + xi0) + 1] += d_tex.t010[1];
            lower_texels[3 * (yi1 * w + xi0) + 2] += d_tex.t010[2];
            lower_texels[3 * (yi1 * w + xi1) + 0] += d_tex.t110[0];
            lower_texels[3 * (yi1 * w + xi1) + 1] += d_tex.t110[1];
            lower_texels[3 * (yi1 * w + xi1) + 2] += d_tex.t110[2];
            // Only accumulate into the next mip level if one exists
            // (level + 1 < num_levels).
            if (d_tex.li >= 0 && d_tex.li < num_levels - 1) {
                auto higher_texels = texels + (level + 1) * w * h * 3;
                higher_texels[3 * (yi0 * w + xi0) + 0] += d_tex.t001[0];
                higher_texels[3 * (yi0 * w + xi0) + 1] += d_tex.t001[1];
                higher_texels[3 * (yi0 * w + xi0) + 2] += d_tex.t001[2];
                higher_texels[3 * (yi0 * w + xi1) + 0] += d_tex.t101[0];
                higher_texels[3 * (yi0 * w + xi1) + 1] += d_tex.t101[1];
                higher_texels[3 * (yi0 * w + xi1) + 2] += d_tex.t101[2];
                higher_texels[3 * (yi1 * w + xi0) + 0] += d_tex.t011[0];
                higher_texels[3 * (yi1 * w + xi0) + 1] += d_tex.t011[1];
                higher_texels[3 * (yi1 * w + xi0) + 2] += d_tex.t011[2];
                higher_texels[3 * (yi1 * w + xi1) + 0] += d_tex.t111[0];
                higher_texels[3 * (yi1 * w + xi1) + 1] += d_tex.t111[1];
                higher_texels[3 * (yi1 * w + xi1) + 2] += d_tex.t111[2];
            }
#endif
        }
    }

    const DTexture3 *d_envmap_vals;
    DEnvironmentMap *d_envmap;
    void *envmap_mutex; // std::mutex is not recognized by CUDA device code
};
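
// Fields of DTexture3 read above (as used by this file): xi, yi are the base
// texel coordinates (xi < 0 flags a constant, single-texel texture), li is
// the sampled mip level (-1 when no trilinear level applies), and t000..t111
// hold the gradient contributions for the eight trilinear corners.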

void accumulate_envmap(const Scene &scene,
                       const BufferView<DTexture3> &d_envmap_vals,
                       const Matrix4x4 &d_world_to_env,
                       DEnvironmentMap &d_envmap,
                       bool use_gpu) {
    parallel_for(envmap_accumulator{
        d_envmap_vals.begin(), &d_envmap, (void*)&scene.envmap_mutex},
        d_envmap_vals.size(), use_gpu);
    // Accumulate the gradient of the world-to-environment transform.
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            d_envmap.world_to_env[4 * i + j] += d_world_to_env(i, j);
        }
    }
}
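
// Usage sketch (hypothetical caller; the buffer and matrix names below are
// illustrative, not part of redner's API):
//
//   BufferView<DTexture3> d_envmap_vals = ...; // per-sample texel gradients
//   Matrix4x4 d_world_to_env = ...;            // transform gradient
//   accumulate_envmap(scene, d_envmap_vals, d_world_to_env,
//                     d_envmap, scene.use_gpu);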