From 68585bb32e885e037f6c74d9a622d637f7479a00 Mon Sep 17 00:00:00 2001 From: Zafar Date: Wed, 1 Jul 2020 15:30:36 -0700 Subject: [PATCH 1/2] [1] Add the pip requirements --- requirements.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d7169ba --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +torch==0.4.0 +torchvision>=0.6 +scikit-umfpack +setuptools +cupy-cuda101 +pynvrtc +opencv-contrib-python From c10e404c649d7dcc6175c1b3ca8f610bb521d5da Mon Sep 17 00:00:00 2001 From: Zafar Date: Wed, 1 Jul 2020 15:31:03 -0700 Subject: [PATCH 2/2] [2] Update to the latest pytorch --- .gitignore | 4 + .gitmodules | 3 + demo_example3.sh | 9 + demo_with_ade20k_ssn.py | 3 +- photo_smooth.py | 19 +- requirements.txt | 2 +- smooth_filter.py | 635 ++++++++++++++++++++-------------------- 7 files changed, 353 insertions(+), 322 deletions(-) create mode 100644 .gitmodules diff --git a/.gitignore b/.gitignore index 67119cc..ae216a3 100755 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ notebooks/.ipynb_checkpoints/* *.zip *.pkl *.pyc + +# Add the images explicitly +*.jpg +*.png diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..3095bbf --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "segmentation"] + path = segmentation + url = https://github.com/mingyuliutw/semantic-segmentation-pytorch diff --git a/demo_example3.sh b/demo_example3.sh index 8d19836..28aa082 100755 --- a/demo_example3.sh +++ b/demo_example3.sh @@ -1,3 +1,12 @@ +# The model paths are coming from the demo defined here: +# https://github.com/mingyuliutw/semantic-segmentation-pytorch +git submodule update --init --recursive + +pushd segmentation +echo -e "\033[33m[WARN]\033[0m Segmentation repo might have dependency on the older version of Pytorch" +./demo_test.sh +popd + mkdir images -p && mkdir results -p; rm images/content3.png -rf; rm images/style3.png -rf; diff --git a/demo_with_ade20k_ssn.py b/demo_with_ade20k_ssn.py index c3a96aa..bebebbd 100644 --- a/demo_with_ade20k_ssn.py +++ b/demo_with_ade20k_ssn.py @@ -13,7 +13,8 @@ from segmentation.models import ModelBuilder, SegmentationModule from lib.nn import user_scattered_collate, async_copy_to from lib.utils import as_numpy, mark_volatile -from scipy.misc import imread, imresize +# from scipy.misc import imread, imresize +from imageio import imread import cv2 from torchvision import transforms import numpy as np diff --git a/photo_smooth.py b/photo_smooth.py index e1d1bd7..d8921f4 100644 --- a/photo_smooth.py +++ b/photo_smooth.py @@ -4,7 +4,7 @@ """ from __future__ import division import torch.nn as nn -import scipy.misc +# import scipy.misc import numpy as np import scipy.sparse import scipy.sparse.linalg @@ -20,13 +20,15 @@ def __init__(self, beta=0.9999): def process(self, initImg, contentImg): if type(contentImg) == str: - content = scipy.misc.imread(contentImg, mode='RGB') + # content = scipy.misc.imread(contentImg, mode='RGB') + content = imageio.imread(contentImg, format='RGB') else: content = contentImg.copy() # content = scipy.misc.imread(contentImg, mode='RGB') if type(initImg) == str: - B = scipy.misc.imread(initImg, mode='RGB').astype(np.float64) / 255 + # B = scipy.misc.imread(initImg, mode='RGB').astype(np.float64) / 255 + B = imageio.imread(initImg, mode='RGB').astype(np.float64) / 255 else: B = scipy.asarray(initImg).astype(np.float64) / 255 # B = self. @@ -35,7 +37,12 @@ def process(self, initImg, contentImg): h = h1 - 4 w = w1 - 4 B = B[int((h1-h)/2):int((h1-h)/2+h),int((w1-w)/2):int((w1-w)/2+w),:] - content = scipy.misc.imresize(content,(h,w)) + # content = scipy.misc.imresize(content,(h,w)) + if isinstance(content, Image.Image): + content = content.resize((h, w)) + else: + content = Image.fromarray(content).resize((h, w)) + content = np.array(content) B = self.__replication_padding(B,2) content = self.__replication_padding(content,2) content = content.astype(np.float64)/255 @@ -57,7 +64,7 @@ def process(self, initImg, contentImg): V = V*(1-self.beta) V = V.reshape(h1,w1,k) V = V[2:2+h,2:2+w,:] - + img = Image.fromarray(np.uint8(np.clip(V * 255., 0, 255.))) return img @@ -96,4 +103,4 @@ def __replication_padding(self, arr,pad): def __rolling_block(self, A, block=(3, 3)): shape = (A.shape[0] - block[0] + 1, A.shape[1] - block[1] + 1) + block strides = (A.strides[0], A.strides[1]) + A.strides - return as_strided(A, shape=shape, strides=strides) \ No newline at end of file + return as_strided(A, shape=shape, strides=strides) diff --git a/requirements.txt b/requirements.txt index d7169ba..ef8ab13 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -torch==0.4.0 +torch>=1.5.1 torchvision>=0.6 scikit-umfpack setuptools diff --git a/smooth_filter.py b/smooth_filter.py index 3aeab4b..772bad1 100644 --- a/smooth_filter.py +++ b/smooth_filter.py @@ -3,337 +3,344 @@ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode). """ src = ''' - #include "/usr/local/cuda/include/math_functions.h" - #define TB 256 - #define EPS 1e-7 - - __device__ bool InverseMat4x4(double m_in[4][4], double inv_out[4][4]) { - double m[16], inv[16]; - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - m[i * 4 + j] = m_in[i][j]; - } - } - - inv[0] = m[5] * m[10] * m[15] - - m[5] * m[11] * m[14] - - m[9] * m[6] * m[15] + - m[9] * m[7] * m[14] + - m[13] * m[6] * m[11] - - m[13] * m[7] * m[10]; - - inv[4] = -m[4] * m[10] * m[15] + - m[4] * m[11] * m[14] + - m[8] * m[6] * m[15] - - m[8] * m[7] * m[14] - - m[12] * m[6] * m[11] + - m[12] * m[7] * m[10]; - - inv[8] = m[4] * m[9] * m[15] - - m[4] * m[11] * m[13] - - m[8] * m[5] * m[15] + - m[8] * m[7] * m[13] + - m[12] * m[5] * m[11] - - m[12] * m[7] * m[9]; - - inv[12] = -m[4] * m[9] * m[14] + - m[4] * m[10] * m[13] + - m[8] * m[5] * m[14] - - m[8] * m[6] * m[13] - - m[12] * m[5] * m[10] + - m[12] * m[6] * m[9]; - - inv[1] = -m[1] * m[10] * m[15] + - m[1] * m[11] * m[14] + - m[9] * m[2] * m[15] - - m[9] * m[3] * m[14] - - m[13] * m[2] * m[11] + - m[13] * m[3] * m[10]; - - inv[5] = m[0] * m[10] * m[15] - - m[0] * m[11] * m[14] - - m[8] * m[2] * m[15] + - m[8] * m[3] * m[14] + - m[12] * m[2] * m[11] - - m[12] * m[3] * m[10]; - - inv[9] = -m[0] * m[9] * m[15] + - m[0] * m[11] * m[13] + - m[8] * m[1] * m[15] - - m[8] * m[3] * m[13] - - m[12] * m[1] * m[11] + - m[12] * m[3] * m[9]; - - inv[13] = m[0] * m[9] * m[14] - - m[0] * m[10] * m[13] - - m[8] * m[1] * m[14] + - m[8] * m[2] * m[13] + - m[12] * m[1] * m[10] - - m[12] * m[2] * m[9]; - - inv[2] = m[1] * m[6] * m[15] - - m[1] * m[7] * m[14] - - m[5] * m[2] * m[15] + - m[5] * m[3] * m[14] + - m[13] * m[2] * m[7] - - m[13] * m[3] * m[6]; - - inv[6] = -m[0] * m[6] * m[15] + - m[0] * m[7] * m[14] + - m[4] * m[2] * m[15] - - m[4] * m[3] * m[14] - - m[12] * m[2] * m[7] + - m[12] * m[3] * m[6]; - - inv[10] = m[0] * m[5] * m[15] - - m[0] * m[7] * m[13] - - m[4] * m[1] * m[15] + - m[4] * m[3] * m[13] + - m[12] * m[1] * m[7] - - m[12] * m[3] * m[5]; - - inv[14] = -m[0] * m[5] * m[14] + - m[0] * m[6] * m[13] + - m[4] * m[1] * m[14] - - m[4] * m[2] * m[13] - - m[12] * m[1] * m[6] + - m[12] * m[2] * m[5]; - - inv[3] = -m[1] * m[6] * m[11] + - m[1] * m[7] * m[10] + - m[5] * m[2] * m[11] - - m[5] * m[3] * m[10] - - m[9] * m[2] * m[7] + - m[9] * m[3] * m[6]; - - inv[7] = m[0] * m[6] * m[11] - - m[0] * m[7] * m[10] - - m[4] * m[2] * m[11] + - m[4] * m[3] * m[10] + - m[8] * m[2] * m[7] - - m[8] * m[3] * m[6]; - - inv[11] = -m[0] * m[5] * m[11] + - m[0] * m[7] * m[9] + - m[4] * m[1] * m[11] - - m[4] * m[3] * m[9] - - m[8] * m[1] * m[7] + - m[8] * m[3] * m[5]; - - inv[15] = m[0] * m[5] * m[10] - - m[0] * m[6] * m[9] - - m[4] * m[1] * m[10] + - m[4] * m[2] * m[9] + - m[8] * m[1] * m[6] - - m[8] * m[2] * m[5]; - - double det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12]; - - if (abs(det) < 1e-9) { - return false; - } - - - det = 1.0 / det; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - inv_out[i][j] = inv[i * 4 + j] * det; - } - } - - return true; - } + // #include "/usr/local/cuda/include/math_functions.h" + #include + #define TB 256 + #define EPS 1e-7 + + __device__ bool InverseMat4x4(double m_in[4][4], double inv_out[4][4]) { + double m[16], inv[16]; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + m[i * 4 + j] = m_in[i][j]; + } + } + + inv[0] = m[5] * m[10] * m[15] - + m[5] * m[11] * m[14] - + m[9] * m[6] * m[15] + + m[9] * m[7] * m[14] + + m[13] * m[6] * m[11] - + m[13] * m[7] * m[10]; + + inv[4] = -m[4] * m[10] * m[15] + + m[4] * m[11] * m[14] + + m[8] * m[6] * m[15] - + m[8] * m[7] * m[14] - + m[12] * m[6] * m[11] + + m[12] * m[7] * m[10]; + + inv[8] = m[4] * m[9] * m[15] - + m[4] * m[11] * m[13] - + m[8] * m[5] * m[15] + + m[8] * m[7] * m[13] + + m[12] * m[5] * m[11] - + m[12] * m[7] * m[9]; + + inv[12] = -m[4] * m[9] * m[14] + + m[4] * m[10] * m[13] + + m[8] * m[5] * m[14] - + m[8] * m[6] * m[13] - + m[12] * m[5] * m[10] + + m[12] * m[6] * m[9]; + + inv[1] = -m[1] * m[10] * m[15] + + m[1] * m[11] * m[14] + + m[9] * m[2] * m[15] - + m[9] * m[3] * m[14] - + m[13] * m[2] * m[11] + + m[13] * m[3] * m[10]; + + inv[5] = m[0] * m[10] * m[15] - + m[0] * m[11] * m[14] - + m[8] * m[2] * m[15] + + m[8] * m[3] * m[14] + + m[12] * m[2] * m[11] - + m[12] * m[3] * m[10]; + + inv[9] = -m[0] * m[9] * m[15] + + m[0] * m[11] * m[13] + + m[8] * m[1] * m[15] - + m[8] * m[3] * m[13] - + m[12] * m[1] * m[11] + + m[12] * m[3] * m[9]; + + inv[13] = m[0] * m[9] * m[14] - + m[0] * m[10] * m[13] - + m[8] * m[1] * m[14] + + m[8] * m[2] * m[13] + + m[12] * m[1] * m[10] - + m[12] * m[2] * m[9]; + + inv[2] = m[1] * m[6] * m[15] - + m[1] * m[7] * m[14] - + m[5] * m[2] * m[15] + + m[5] * m[3] * m[14] + + m[13] * m[2] * m[7] - + m[13] * m[3] * m[6]; + + inv[6] = -m[0] * m[6] * m[15] + + m[0] * m[7] * m[14] + + m[4] * m[2] * m[15] - + m[4] * m[3] * m[14] - + m[12] * m[2] * m[7] + + m[12] * m[3] * m[6]; + + inv[10] = m[0] * m[5] * m[15] - + m[0] * m[7] * m[13] - + m[4] * m[1] * m[15] + + m[4] * m[3] * m[13] + + m[12] * m[1] * m[7] - + m[12] * m[3] * m[5]; + + inv[14] = -m[0] * m[5] * m[14] + + m[0] * m[6] * m[13] + + m[4] * m[1] * m[14] - + m[4] * m[2] * m[13] - + m[12] * m[1] * m[6] + + m[12] * m[2] * m[5]; + + inv[3] = -m[1] * m[6] * m[11] + + m[1] * m[7] * m[10] + + m[5] * m[2] * m[11] - + m[5] * m[3] * m[10] - + m[9] * m[2] * m[7] + + m[9] * m[3] * m[6]; + + inv[7] = m[0] * m[6] * m[11] - + m[0] * m[7] * m[10] - + m[4] * m[2] * m[11] + + m[4] * m[3] * m[10] + + m[8] * m[2] * m[7] - + m[8] * m[3] * m[6]; + + inv[11] = -m[0] * m[5] * m[11] + + m[0] * m[7] * m[9] + + m[4] * m[1] * m[11] - + m[4] * m[3] * m[9] - + m[8] * m[1] * m[7] + + m[8] * m[3] * m[5]; + + inv[15] = m[0] * m[5] * m[10] - + m[0] * m[6] * m[9] - + m[4] * m[1] * m[10] + + m[4] * m[2] * m[9] + + m[8] * m[1] * m[6] - + m[8] * m[2] * m[5]; + + double det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12]; + + if (abs(det) < 1e-9) { + return false; + } + + + det = 1.0 / det; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + inv_out[i][j] = inv[i * 4 + j] * det; + } + } + + return true; + } extern "C" - __global__ void best_local_affine_kernel( - float *output, float *input, float *affine_model, - int h, int w, float epsilon, int kernel_radius - ) - { - int size = h * w; - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < size) { - int x = id % w, y = id / w; - - double Mt_M[4][4] = {}; // 4x4 - double invMt_M[4][4] = {}; - double Mt_S[3][4] = {}; // RGB -> 1x4 - double A[3][4] = {}; - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) { - Mt_M[i][j] = 0, invMt_M[i][j] = 0; - if (i != 3) { - Mt_S[i][j] = 0, A[i][j] = 0; - if (i == j) - Mt_M[i][j] = 1e-3; - } - } - - for (int dy = -kernel_radius; dy <= kernel_radius; dy++) { - for (int dx = -kernel_radius; dx <= kernel_radius; dx++) { - - int xx = x + dx, yy = y + dy; - int id2 = yy * w + xx; - - if (0 <= xx && xx < w && 0 <= yy && yy < h) { - - Mt_M[0][0] += input[id2 + 2*size] * input[id2 + 2*size]; - Mt_M[0][1] += input[id2 + 2*size] * input[id2 + size]; - Mt_M[0][2] += input[id2 + 2*size] * input[id2]; - Mt_M[0][3] += input[id2 + 2*size]; - - Mt_M[1][0] += input[id2 + size] * input[id2 + 2*size]; - Mt_M[1][1] += input[id2 + size] * input[id2 + size]; - Mt_M[1][2] += input[id2 + size] * input[id2]; - Mt_M[1][3] += input[id2 + size]; - - Mt_M[2][0] += input[id2] * input[id2 + 2*size]; - Mt_M[2][1] += input[id2] * input[id2 + size]; - Mt_M[2][2] += input[id2] * input[id2]; - Mt_M[2][3] += input[id2]; - - Mt_M[3][0] += input[id2 + 2*size]; - Mt_M[3][1] += input[id2 + size]; - Mt_M[3][2] += input[id2]; - Mt_M[3][3] += 1; - - Mt_S[0][0] += input[id2 + 2*size] * output[id2 + 2*size]; - Mt_S[0][1] += input[id2 + size] * output[id2 + 2*size]; - Mt_S[0][2] += input[id2] * output[id2 + 2*size]; - Mt_S[0][3] += output[id2 + 2*size]; - - Mt_S[1][0] += input[id2 + 2*size] * output[id2 + size]; - Mt_S[1][1] += input[id2 + size] * output[id2 + size]; - Mt_S[1][2] += input[id2] * output[id2 + size]; - Mt_S[1][3] += output[id2 + size]; - - Mt_S[2][0] += input[id2 + 2*size] * output[id2]; - Mt_S[2][1] += input[id2 + size] * output[id2]; - Mt_S[2][2] += input[id2] * output[id2]; - Mt_S[2][3] += output[id2]; - } - } - } - - bool success = InverseMat4x4(Mt_M, invMt_M); - - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 4; j++) { - for (int k = 0; k < 4; k++) { - A[i][j] += invMt_M[j][k] * Mt_S[i][k]; - } - } - } - - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 4; j++) { - int affine_id = i * 4 + j; - affine_model[12 * id + affine_id] = A[i][j]; - } - } - } - return ; - } + __global__ void best_local_affine_kernel( + float *output, float *input, float *affine_model, + int h, int w, float epsilon, int kernel_radius + ) + { + int size = h * w; + int id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id < size) { + int x = id % w, y = id / w; + + double Mt_M[4][4] = {}; // 4x4 + double invMt_M[4][4] = {}; + double Mt_S[3][4] = {}; // RGB -> 1x4 + double A[3][4] = {}; + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) { + Mt_M[i][j] = 0, invMt_M[i][j] = 0; + if (i != 3) { + Mt_S[i][j] = 0, A[i][j] = 0; + if (i == j) + Mt_M[i][j] = 1e-3; + } + } + + for (int dy = -kernel_radius; dy <= kernel_radius; dy++) { + for (int dx = -kernel_radius; dx <= kernel_radius; dx++) { + + int xx = x + dx, yy = y + dy; + int id2 = yy * w + xx; + + if (0 <= xx && xx < w && 0 <= yy && yy < h) { + + Mt_M[0][0] += input[id2 + 2*size] * input[id2 + 2*size]; + Mt_M[0][1] += input[id2 + 2*size] * input[id2 + size]; + Mt_M[0][2] += input[id2 + 2*size] * input[id2]; + Mt_M[0][3] += input[id2 + 2*size]; + + Mt_M[1][0] += input[id2 + size] * input[id2 + 2*size]; + Mt_M[1][1] += input[id2 + size] * input[id2 + size]; + Mt_M[1][2] += input[id2 + size] * input[id2]; + Mt_M[1][3] += input[id2 + size]; + + Mt_M[2][0] += input[id2] * input[id2 + 2*size]; + Mt_M[2][1] += input[id2] * input[id2 + size]; + Mt_M[2][2] += input[id2] * input[id2]; + Mt_M[2][3] += input[id2]; + + Mt_M[3][0] += input[id2 + 2*size]; + Mt_M[3][1] += input[id2 + size]; + Mt_M[3][2] += input[id2]; + Mt_M[3][3] += 1; + + Mt_S[0][0] += input[id2 + 2*size] * output[id2 + 2*size]; + Mt_S[0][1] += input[id2 + size] * output[id2 + 2*size]; + Mt_S[0][2] += input[id2] * output[id2 + 2*size]; + Mt_S[0][3] += output[id2 + 2*size]; + + Mt_S[1][0] += input[id2 + 2*size] * output[id2 + size]; + Mt_S[1][1] += input[id2 + size] * output[id2 + size]; + Mt_S[1][2] += input[id2] * output[id2 + size]; + Mt_S[1][3] += output[id2 + size]; + + Mt_S[2][0] += input[id2 + 2*size] * output[id2]; + Mt_S[2][1] += input[id2 + size] * output[id2]; + Mt_S[2][2] += input[id2] * output[id2]; + Mt_S[2][3] += output[id2]; + } + } + } + + bool success = InverseMat4x4(Mt_M, invMt_M); + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 4; k++) { + A[i][j] += invMt_M[j][k] * Mt_S[i][k]; + } + } + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + int affine_id = i * 4 + j; + affine_model[12 * id + affine_id] = A[i][j]; + } + } + } + return ; + } extern "C" - __global__ void bilateral_smooth_kernel( - float *affine_model, float *filtered_affine_model, float *guide, - int h, int w, int kernel_radius, float sigma1, float sigma2 - ) - { - int id = blockIdx.x * blockDim.x + threadIdx.x; - int size = h * w; - if (id < size) { - int x = id % w; - int y = id / w; - - double sum_affine[12] = {}; - double sum_weight = 0; - for (int dx = -kernel_radius; dx <= kernel_radius; dx++) { - for (int dy = -kernel_radius; dy <= kernel_radius; dy++) { - int yy = y + dy, xx = x + dx; - int id2 = yy * w + xx; - if (0 <= xx && xx < w && 0 <= yy && yy < h) { - float color_diff1 = guide[yy*w + xx] - guide[y*w + x]; - float color_diff2 = guide[yy*w + xx + size] - guide[y*w + x + size]; - float color_diff3 = guide[yy*w + xx + 2*size] - guide[y*w + x + 2*size]; - float color_diff_sqr = - (color_diff1*color_diff1 + color_diff2*color_diff2 + color_diff3*color_diff3) / 3; - - float v1 = exp(-(dx * dx + dy * dy) / (2 * sigma1 * sigma1)); - float v2 = exp(-(color_diff_sqr) / (2 * sigma2 * sigma2)); - float weight = v1 * v2; - - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 4; j++) { - int affine_id = i * 4 + j; - sum_affine[affine_id] += weight * affine_model[id2*12 + affine_id]; - } - } - sum_weight += weight; - } - } - } - - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 4; j++) { - int affine_id = i * 4 + j; - filtered_affine_model[id*12 + affine_id] = sum_affine[affine_id] / sum_weight; - } - } - } - return ; - } + __global__ void bilateral_smooth_kernel( + float *affine_model, float *filtered_affine_model, float *guide, + int h, int w, int kernel_radius, float sigma1, float sigma2 + ) + { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size = h * w; + if (id < size) { + int x = id % w; + int y = id / w; + + double sum_affine[12] = {}; + double sum_weight = 0; + for (int dx = -kernel_radius; dx <= kernel_radius; dx++) { + for (int dy = -kernel_radius; dy <= kernel_radius; dy++) { + int yy = y + dy, xx = x + dx; + int id2 = yy * w + xx; + if (0 <= xx && xx < w && 0 <= yy && yy < h) { + float color_diff1 = guide[yy*w + xx] - guide[y*w + x]; + float color_diff2 = guide[yy*w + xx + size] - guide[y*w + x + size]; + float color_diff3 = guide[yy*w + xx + 2*size] - guide[y*w + x + 2*size]; + float color_diff_sqr = + (color_diff1*color_diff1 + color_diff2*color_diff2 + color_diff3*color_diff3) / 3; + + float v1 = exp(-(dx * dx + dy * dy) / (2 * sigma1 * sigma1)); + float v2 = exp(-(color_diff_sqr) / (2 * sigma2 * sigma2)); + float weight = v1 * v2; + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + int affine_id = i * 4 + j; + sum_affine[affine_id] += weight * affine_model[id2*12 + affine_id]; + } + } + sum_weight += weight; + } + } + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + int affine_id = i * 4 + j; + filtered_affine_model[id*12 + affine_id] = sum_affine[affine_id] / sum_weight; + } + } + } + return ; + } extern "C" - __global__ void reconstruction_best_kernel( - float *input, float *filtered_affine_model, float *filtered_best_output, - int h, int w - ) - { - int id = blockIdx.x * blockDim.x + threadIdx.x; - int size = h * w; - if (id < size) { - double out1 = - input[id + 2*size] * filtered_affine_model[id*12 + 0] + // A[0][0] + - input[id + size] * filtered_affine_model[id*12 + 1] + // A[0][1] + - input[id] * filtered_affine_model[id*12 + 2] + // A[0][2] + - filtered_affine_model[id*12 + 3]; //A[0][3]; - double out2 = - input[id + 2*size] * filtered_affine_model[id*12 + 4] + //A[1][0] + - input[id + size] * filtered_affine_model[id*12 + 5] + //A[1][1] + - input[id] * filtered_affine_model[id*12 + 6] + //A[1][2] + - filtered_affine_model[id*12 + 7]; //A[1][3]; - double out3 = - input[id + 2*size] * filtered_affine_model[id*12 + 8] + //A[2][0] + - input[id + size] * filtered_affine_model[id*12 + 9] + //A[2][1] + - input[id] * filtered_affine_model[id*12 + 10] + //A[2][2] + - filtered_affine_model[id*12 + 11]; // A[2][3]; - - filtered_best_output[id] = out1; - filtered_best_output[id + size] = out2; - filtered_best_output[id + 2*size] = out3; - } - return ; - } - ''' - + __global__ void reconstruction_best_kernel( + float *input, float *filtered_affine_model, float *filtered_best_output, + int h, int w + ) + { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size = h * w; + if (id < size) { + double out1 = + input[id + 2*size] * filtered_affine_model[id*12 + 0] + // A[0][0] + + input[id + size] * filtered_affine_model[id*12 + 1] + // A[0][1] + + input[id] * filtered_affine_model[id*12 + 2] + // A[0][2] + + filtered_affine_model[id*12 + 3]; //A[0][3]; + double out2 = + input[id + 2*size] * filtered_affine_model[id*12 + 4] + //A[1][0] + + input[id + size] * filtered_affine_model[id*12 + 5] + //A[1][1] + + input[id] * filtered_affine_model[id*12 + 6] + //A[1][2] + + filtered_affine_model[id*12 + 7]; //A[1][3]; + double out3 = + input[id + 2*size] * filtered_affine_model[id*12 + 8] + //A[2][0] + + input[id + size] * filtered_affine_model[id*12 + 9] + //A[2][1] + + input[id] * filtered_affine_model[id*12 + 10] + //A[2][2] + + filtered_affine_model[id*12 + 11]; // A[2][3]; + + filtered_best_output[id] = out1; + filtered_best_output[id + size] = out2; + filtered_best_output[id + 2*size] = out3; + } + return ; + } + ''' + +import os import torch import numpy as np from PIL import Image from cupy.cuda import function -from pynvrtc.compiler import Program +from pynvrtc.compiler import Program, ProgramException from collections import namedtuple def smooth_local_affine(output_cpu, input_cpu, epsilon, patch, h, w, f_r, f_e): # program = Program(src.encode('utf-8'), 'best_local_affine_kernel.cu'.encode('utf-8')) # ptx = program.compile(['-I/usr/local/cuda/include'.encode('utf-8')]) - program = Program(src, 'best_local_affine_kernel.cu') - ptx = program.compile(['-I/usr/local/cuda/include']) + CUDA_PATH = os.getenv('CUDA_PATH', '/usr/local/cuda') + try: + program = Program(src, 'best_local_affine_kernel.cu') + ptx = program.compile([f'-I{CUDA_PATH}/include']) + except ProgramException as e: + print(f'Error: {e}') + raise e m = function.Module() m.load(bytes(ptx.encode()))