cuconv.py
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
# DEVICE SETUP
BLOCK_SIZE = 32  # Max 32: 32**2 = 1024 threads per block, the limit on a GTX 1060
# Compile the CUDA kernel source
with open("kernel.cu", "r") as f:
    mod = SourceModule(f.read())
# Get functions
conv = mod.get_function("conv")
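# Assumed signature of "conv", inferred from the launch in convolve() below;
# the authoritative definition lives in kernel.cu:
#   __global__ void conv(float *a, float *b, int aw, int ah,
#                        int bw, int bh, int b_sum, float *c)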
def convolve(a, b):
    # Ensure C-contiguous float32 arrays, as memcpy_htod expects
    a, b = [np.ascontiguousarray(i, dtype=np.float32) for i in (a, b)]
    # Matrix A (input)
    aw = np.int32(a.shape[1])  # Width of input matrix
    ah = np.int32(a.shape[0])  # Height of input matrix
    # Matrix B (kernel)
    bw = np.int32(b.shape[1])  # Width of kernel
    if bw % 2 == 0:
        print("Kernel width is not an odd number! Strange things will happen...")
    bh = np.int32(b.shape[0])  # Height of kernel
    if bh % 2 == 0:
        print("Kernel height is not an odd number! Strange things will happen...")
    # Sum of |b|, truncated to int32 and passed to the kernel
    # (presumably for normalizing the output; kernel.cu defines the semantics)
    b_sum = np.int32(np.absolute(b).sum())
    # Matrix C (output): no padding is added, so each axis shrinks by
    # (kernel size - 1), i.e. (size - 1)/2 is lost on each side
    c = np.empty([ah - (bh - 1), aw - (bw - 1)], dtype=np.float32)
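    # e.g. a 100x100 input with a 5x5 kernel yields a 96x96 output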
    # Allocate memory on the device
    a_gpu = cuda.mem_alloc(a.nbytes)
    b_gpu = cuda.mem_alloc(b.nbytes)
    c_gpu = cuda.mem_alloc(c.nbytes)
    # Copy the input matrices to device memory
    cuda.memcpy_htod(a_gpu, a)
    cuda.memcpy_htod(b_gpu, b)
    # Set grid size from the A matrix: one block per BLOCK_SIZE x BLOCK_SIZE
    # tile, rounding up so partial tiles at the edges are covered. Ceiling
    # division replaces the original "is 0" test, which always fails for
    # np.int32 values and over-allocated a block.
    grid = (int((aw + BLOCK_SIZE - 1) // BLOCK_SIZE),
            int((ah + BLOCK_SIZE - 1) // BLOCK_SIZE),
            1)
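    # e.g. aw = 100 with BLOCK_SIZE = 32: (100 + 31) // 32 = 4 blocks in x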
    # Launch the GPU kernel
    conv(a_gpu, b_gpu, aw, ah, bw, bh, b_sum, c_gpu,
         block=(BLOCK_SIZE, BLOCK_SIZE, 1), grid=grid)
    # Copy the result back to the host
    cuda.memcpy_dtoh(c, c_gpu)
    # Free device memory. PyCUDA frees allocations when they are garbage
    # collected, but freeing explicitly keeps GPU memory use deterministic.
    a_gpu.free()
    b_gpu.free()
    c_gpu.free()
    return c
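
# Example usage: a minimal sketch (not part of the original module) showing the
# expected call pattern; it assumes kernel.cu sits next to this script and
# defines the "conv" kernel loaded above.
if __name__ == "__main__":
    image = np.random.rand(256, 256)  # arbitrary test input
    box = np.ones((3, 3))             # 3x3 box filter (odd dimensions, as required)
    result = convolve(image, box)
    print(result.shape)  # (254, 254): the input shrinks by kernel size - 1 per axis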