From be1c515a336a21d8687375bc86902ed76eea2bd7 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Fri, 15 Jul 2016 14:09:06 +0100 Subject: [PATCH 01/33] GPU chi squared gradient with respect ro Sersic parameters --- .../rime/v4/gpu/SersicChiSquaredGradient.py | 473 ++++++++++++++++++ 1 file changed, 473 insertions(+) create mode 100644 montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py new file mode 100644 index 000000000..8e6bafa40 --- /dev/null +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2016 Marzia Rivi, Simon Perkins +# +# This file is part of montblanc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . + +import numpy as np +import string + +import pycuda.driver as cuda +import pycuda.gpuarray as gpuarray +from pycuda.compiler import SourceModule + +import montblanc +import montblanc.util as mbu +from montblanc.node import Node + + +from montblanc.config import RimeSolverConfig as Options + +FLOAT_PARAMS = { + 'BLOCKDIMX': 4, # Number of channels*4 polarisations + 'BLOCKDIMY': 25, # Number of baselines + 'BLOCKDIMZ': 10, # Number of timesteps + 'maxregs': 48 # Maximum number of registers +} + +DOUBLE_PARAMS = { + 'BLOCKDIMX': 4, # Number of channels*4 polarisations + 'BLOCKDIMY': 25, # Number of baselines + 'BLOCKDIMZ': 10, # Number of timesteps + 'maxregs': 63 # Maximum number of registers +} + +KERNEL_TEMPLATE = string.Template(""" +#include +#include +#include \"math_constants.h\" +#include +#include + +#define BLOCKDIMX ${BLOCKDIMX} +#define BLOCKDIMY ${BLOCKDIMY} +#define BLOCKDIMZ ${BLOCKDIMZ} + +#define TWO_PI_OVER_C ${two_pi_over_c} + +template +class SumCohTraits {}; + +template <> +class SumCohTraits { +public: + typedef float3 UVWType; +}; + +template <> +class SumCohTraits { +public: + typedef double3 UVWType; +}; + +// Here, the definition of the +// rime_const_data struct +// is inserted into the template +// An area of constant memory +// containing an instance of this +// structure is declared. +${rime_const_data_struct} +__constant__ rime_const_data C; +#define LEXT(name) C.name.lower_extent +#define UEXT(name) C.name.upper_extent +#define DEXT(name) (C.name.upper_extent - C.name.lower_extent) + +#define NA (C.na.local_size) +#define NBL (C.nbl.local_size) +#define NCHAN (C.nchan.local_size) +#define NTIME (C.ntime.local_size) +#define NPSRC (C.npsrc.local_size) +#define NGSRC (C.ngsrc.local_size) +#define NSSRC (C.nssrc.local_size) +#define NSRC (C.nnsrc.local_size) +#define NPOL (C.npol.local_size) +#define NPOLCHAN (C.npolchan.local_size) + + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; +old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + + +template < + typename T, + bool apply_weights=false, + typename Tr=montblanc::kernel_traits, + typename Po=montblanc::kernel_policies > +__device__ +void sersic_chi_squared_gradient_impl( + typename SumCohTraits::UVWType * uvw, + typename Tr::ft * sersic_shape, + typename Tr::ft * frequency, + int * antenna1, + int * antenna2, + typename Tr::ct * jones, + uint8_t * flag, + typename Tr::ft * weight_vector, + typename Tr::ct * observed_vis, + typename Tr::ct * G_term, + typename Tr::ct * visibilities, + typename Tr::ft * X2_grad) +{ + int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x; + int BL = blockIdx.y*blockDim.y + threadIdx.y; + int TIME = blockIdx.z*blockDim.z + threadIdx.z; + + if(TIME >= DEXT(ntime) || BL >= DEXT(nbl) || POLCHAN >= DEXT(npolchan)) + return; + + __shared__ struct { + typename SumCohTraits::UVWType uvw[BLOCKDIMZ][BLOCKDIMY]; + + T e1; + T e2; + T sersic_scale; + + T X2_grad_part[3]; + + T freq[BLOCKDIMX]; + + } shared; + + typename Tr::ct dev_vis[3]; + + T & U = shared.uvw[threadIdx.z][threadIdx.y].x; + T & V = shared.uvw[threadIdx.z][threadIdx.y].y; + T & W = shared.uvw[threadIdx.z][threadIdx.y].z; + + int i; + + // Figure out the antenna pairs + i = TIME*NBL + BL; + int ANT1 = antenna1[i]; + int ANT2 = antenna2[i]; + + // UVW coordinates vary by baseline and time, but not polarised channel + if(threadIdx.x == 0) + { + // UVW, calculated from u_pq = u_p - u_q + i = TIME*NA + ANT1; + shared.uvw[threadIdx.z][threadIdx.y] = uvw[i]; + + i = TIME*NA + ANT2; + typename SumCohTraits::UVWType ant2_uvw = uvw[i]; + U -= ant2_uvw.x; + V -= ant2_uvw.y; + W -= ant2_uvw.z; + } + + // Wavelength varies by channel, but not baseline and time + // TODO uses 4 times the actually required space, since + // we don't need to store a frequency per polarisation + if(threadIdx.y == 0 && threadIdx.z == 0) + { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; } + + i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN; + typename Tr::ct delta = observed_vis[i]; + delta.x -= visibilities[i].x; + delta.y -= visibilities[i].y; + + // Zero the polarisation if it is flagged + if(flag[i] > 0) + { + delta.x = 0; delta.y = 0; + } + + // Apply any necessary weighting factors + if(apply_weights) + { + T w = weight_vector[i]; + delta.x *= w; + delta.y *= w; + } + + int SRC_START = DEXT(npsrc) + DEXT(ngsrc); + int SRC_STOP = SRC_START + DEXT(nssrc); + + // Loop over Sersic Sources + for(int SRC = SRC_START; SRC < SRC_STOP; ++SRC) + { + // sersic shape only varies by source. Shape parameters + // thus apply to the entire block and we can load them with + // only the first thread. + if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) + { + i = SRC - SRC_START; shared.e1 = sersic_shape[i]; + i += DEXT(nssrc); shared.e2 = sersic_shape[i]; + i += DEXT(nssrc); shared.sersic_scale = sersic_shape[i]; + + shared.X2_grad_part[0] = 0.; + shared.X2_grad_part[1] = 0.; + shared.X2_grad_part[2] = 0.; + } + + __syncthreads(); + + // Create references to a + // complicated part of shared memory + const T & U = shared.uvw[threadIdx.z][threadIdx.y].x; + const T & V = shared.uvw[threadIdx.z][threadIdx.y].y; + + // sersic source in the Fourier domain + T u1 = U*(T(1.0)+shared.e1) + V*shared.e2; + T v1 = U*shared.e2 + V*(T(1.0)-shared.e1); + + T sersic_factor = T(1.0)-shared.e1*shared.e1-shared.e2*shared.e2; + T dev_e1 = u1*(U*sersic_factor + 2*shared.e1*u1); + dev_e1 += v1*(2*shared.e1*v1 - V*sersic_factor); + T dev_e2 = u1*(V*sersic_factor + 2*shared.e2*u1); + dev_e2 += v1*(U*sersic_factor + 2*shared.e2*v1); + + // pay attention to the instructions order because temp and sersic_factor variables are utilised twice!!! + T temp = T(TWO_PI_OVER_C)*shared.freq[threadIdx.x]*shared.sersic_scale; + temp /= sersic_factor; + temp *= temp; + dev_e1 *= temp/sersic_factor; + dev_e2 *= temp/sersic_factor; + + temp *= (u1*u1+v1*v1); + sersic_factor = T(1.0)+temp; + dev_e1 /= sersic_factor; + dev_e2 /= sersic_factor; + + T dev_scale = temp/(shared.sersic_scale*sersic_factor); + + // Get the complex scalars for antenna two and multiply + // in the exponent term + // Get the complex scalar for antenna one and conjugate it + i = ((SRC*NTIME + TIME)*NA + ANT1)*NPOLCHAN + POLCHAN; + typename Tr::ct ant_one = jones[i]; + sersic_factor = T(1.0) / (sersic_factor*Po::sqrt(sersic_factor)); + ant_one.x *= sersic_factor; + ant_one.y *= sersic_factor; + i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN; + typename Tr::ct ant_two = jones[i]; + montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + + dev_vis[0].x = ant_one.x*dev_e1; + dev_vis[0].y = ant_one.y*dev_e1; + dev_vis[1].x = ant_one.x*dev_e2; + dev_vis[1].y = ant_one.y*dev_e2; + dev_vis[2].x = ant_one.x*dev_scale; + dev_vis[2].y = ant_one.y*dev_scale; + /* + for (int p = 0; p < 3; p++) + { + // Multiply the visibility derivative by antenna 1's g term + i = (TIME*NA + ANT1)*NPOLCHAN + POLCHAN; + typename Tr::ct ant1_g_term = G_term[i]; + montblanc::jones_multiply_4x4_in_place(ant1_g_term, dev_vis[p]); + + // Multiply the visibility by antenna 2's g term + i = (TIME*NA + ANT2)*NPOLCHAN + POLCHAN; + typename Tr::ct ant2_g_term = G_term[i]; + montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant1_g_term, ant2_g_term); + dev_vis[p] = ant1_g_term; + } + */ + + // Write partial derivative with respect to sersic parameters + for (int p=0; p<3; p++) + { + dev_vis[p].x *= delta.x; + dev_vis[p].y *= delta.y; + + typename Tr::ct other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 2); + // Add polarisations 2 and 3 to 0 and 1 + if((POLCHAN & 0x3) < 2) + { + dev_vis[p].x += other.x; + dev_vis[p].y += other.y; + } + other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 1); + + // If this is the polarisation 0, add polarisation 1 + // and write out this chi squared grad term + if((POLCHAN & 0x3) == 0) + { + dev_vis[p].x += other.x; + dev_vis[p].y += other.y; + + //atomic addition to avoid concurrent access in the shared memory + dev_vis[p].x += dev_vis[p].y; + atomicAdd(&(shared.X2_grad_part[p]), dev_vis[p].x); + } + } + __syncthreads(); + + //atomic addition to avoid concurrent access in the device memory (contribution for a single particle) + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) + { + i = SRC - DEXT(npsrc) - DEXT(ngsrc); + atomicAdd(&(X2_grad[i]), shared.X2_grad_part[0]); + i += DEXT(nssrc); + atomicAdd(&(X2_grad[i]), shared.X2_grad_part[1]); + i += DEXT(nssrc); + atomicAdd(&(X2_grad[i]), shared.X2_grad_part[2]); + } + } +} + +extern "C" { + +// Macro that stamps out different kernels, depending +// on whether we're handling floats or doubles +// Arguments +// - ft: The floating point type. Should be float/double. +// - ct: The complex type. Should be float2/double2. +// - apply_weights: boolean indicating whether we're weighting our visibilities + +#define stamp_sersic_chi_squared_gradient_fn(ft, ct, uvwt, apply_weights) \ +__global__ void \ +sersic_chi_squared_gradient( \ + uvwt * uvw, \ + ft * sersic_shape, \ + ft * frequency, \ + int * antenna1, \ + int * antenna2, \ + ct * jones, \ + uint8_t * flag,\ + ft * weight_vector, \ + ct * observed_vis, \ + ct * G_term, \ + ct * visibilities, \ + ft * X2_grad) \ +{ \ + sersic_chi_squared_gradient_impl(uvw, sersic_shape, \ + frequency, antenna1, antenna2, jones, flag,\ + weight_vector, observed_vis, G_term, \ + visibilities, X2_grad); \ +} + +${stamp_function} + +} // extern "C" { +""") + +class SersicChiSquaredGradient(Node): + def __init__(self, weight_vector=False): + super(SersicChiSquaredGradient, self).__init__() + self.weight_vector = weight_vector + + def initialise(self, solver, stream=None): + slvr = solver + ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') + + # Get a property dictionary off the solver + D = slvr.template_dict() + # Include our kernel parameters + D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS) + D['rime_const_data_struct'] = slvr.const_data().string_def() + + D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \ + mbu.redistribute_threads(D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'], + npolchan, nbl, ntime) + + regs = str(FLOAT_PARAMS['maxregs'] \ + if slvr.is_float() else DOUBLE_PARAMS['maxregs']) + + # Create the signature of the call to the function stamping macro + stamp_args = ', '.join([ + 'float' if slvr.is_float() else 'double', + 'float2' if slvr.is_float() else 'double2', + 'float3' if slvr.is_float() else 'double3', + 'true' if slvr.use_weight_vector() else 'false']) + stamp_fn = ''.join(['stamp_sersic_chi_squared_gradient_fn(', stamp_args, ')']) + D['stamp_function'] = stamp_fn + + kname = 'sersic_chi_squared_gradient' + + self.mod = SourceModule( + KERNEL_TEMPLATE.substitute(**D), + options=['-lineinfo','-maxrregcount', regs], + include_dirs=[montblanc.get_source_path()], + no_extern_c=True) + + self.rime_const_data_gpu = self.mod.get_global('C') + self.kernel = self.mod.get_function(kname) + self.launch_params = self.get_launch_params(slvr, D) + + def shutdown(self, solver, stream=None): + pass + + def pre_execution(self, solver, stream=None): + pass + + def get_launch_params(self, slvr, D): + polchans_per_block = D['BLOCKDIMX'] + bl_per_block = D['BLOCKDIMY'] + times_per_block = D['BLOCKDIMZ'] + + ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') + polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block) + bl_blocks = mbu.blocks_required(nbl, bl_per_block) + time_blocks = mbu.blocks_required(ntime, times_per_block) + + return { + 'block' : (polchans_per_block, bl_per_block, times_per_block), + 'grid' : (polchan_blocks, bl_blocks, time_blocks), + } + + def execute(self, solver, stream=None): + slvr = solver + nssrc = slvr.dim_local_size('nssrc') + + if stream is not None: + cuda.memcpy_htod_async( + self.rime_const_data_gpu[0], + slvr.const_data().ndary(), + stream=stream) + else: + cuda.memcpy_htod( + self.rime_const_data_gpu[0], + slvr.const_data().ndary()) + + sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ + else slvr.sersic_shape + + if slvr.is_float(): + gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32) + else: + gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64) + + self.kernel(slvr.uvw, sersic, + slvr.frequency, slvr.antenna1, slvr.antenna2, + slvr.jones, slvr.flag, slvr.weight_vector, + slvr.observed_vis, slvr.G_term, + slvr.model_vis, gradient, + stream=stream, **self.launch_params) + + slvr.X2_grad = 6*gradient.get().reshape(3,nssrc) + + if not self.weight_vector: + slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd + + def post_execution(self, solver, stream=None): + pass From 9a5623a2eaea3d434c8378dc334cb4767157bf48 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Fri, 15 Jul 2016 14:10:30 +0100 Subject: [PATCH 02/33] CPU chi squared gradient with respect ro Sersic parameters --- montblanc/impl/rime/v4/cpu/CPUSolver.py | 202 +++++++++++++++++++++++- 1 file changed, 199 insertions(+), 3 deletions(-) diff --git a/montblanc/impl/rime/v4/cpu/CPUSolver.py b/montblanc/impl/rime/v4/cpu/CPUSolver.py index 5026d3d21..621e245bb 100644 --- a/montblanc/impl/rime/v4/cpu/CPUSolver.py +++ b/montblanc/impl/rime/v4/cpu/CPUSolver.py @@ -650,6 +650,144 @@ def compute_gekb_vis(self, ekb_vis=None): return result + + def compute_sersic_derivatives(self): + """ + Compute the partial derivative values for the sersic (exponential) sources. + Returns a (nssrc, ntime, nbl, nchan) matrix of floating point scalars. + """ + + nssrc, ntime, nbl, nchan = self.dim_local_size('nssrc', 'ntime', 'nbl', 'nchan') + ant0, ant1 = self.ap_idx() + + # Calculate per baseline u from per antenna u + up, uq = self.uvw[:,:,0][ant0], self.uvw[:,:,0][ant1] + u = ne.evaluate('uq-up', {'up': up, 'uq': uq}) + + # Calculate per baseline v from per antenna v + vp, vq = self.uvw[:,:,1][ant0], self.uvw[:,:,1][ant1] + v = ne.evaluate('vq-vp', {'vp': vp, 'vq': vq}) + + # Calculate per baseline w from per antenna w + wp, wq = self.uvw[:,:,2][ant0], self.uvw[:,:,2][ant1] + w = ne.evaluate('wq-wp', {'wp': wp, 'wq': wq}) + + e1 = self.sersic_shape[0] + e2 = self.sersic_shape[1] + R = self.sersic_shape[2] + + # OK, try obtain the same results with the fwhm factored out! + # u1 = u*(1+e1) - v*e2 + # v1 = u*e2 + v*(1-e1) + u1 = ne.evaluate('u_1_e1 + v_e2', + { 'u_1_e1': np.outer(1+e1, u), 'v_e2' : np.outer(e2, v)}) + v1 = ne.evaluate('u_e2 + v_1_e1', + { 'u_e2' : np.outer(e2, u), 'v_1_e1' : np.outer(1-e1,v)}) + + assert u1.shape == (nssrc, ntime*nbl) + assert v1.shape == (nssrc, ntime*nbl) + + e12 = 1 - e1 * e1 - e2 * e2 + + de1 = ne.evaluate('u1*(u_e12+2*e1*u1) + v1*(2*e1*v1-v_e12)', + local_dict={ + 'u1' : u1, 'v1': v1,\ + 'u_e12': np.outer(e12, u),\ + 'v_e12': np.outer(e12, v),\ + 'e1': e1[:,np.newaxis]}).reshape(nssrc, ntime,nbl) + + de2 = ne.evaluate('u1*(v_e12+2*e2*u1) + v1*(2*e2*v1+u_e12)', + local_dict={ + 'u1' : u1, 'v1': v1,\ + 'u_e12': np.outer(e12, u),\ + 'v_e12': np.outer(e12, v),\ + 'e2': e2[:,np.newaxis]}).reshape(nssrc, ntime,nbl) + + wavenum = (self.two_pi_over_c * self.frequency) + + temp = ne.evaluate('(wavenum*R/e12)**2', + { + 'R' : R[:,np.newaxis],\ + 'e12': e12[:,np.newaxis],\ + 'wavenum': wavenum[np.newaxis,:]}).reshape(nssrc,nchan) + + sfactor = ne.evaluate('1+temp*(u1*u1+v1*v1)', + { + 'u1' : u1[:,:,np.newaxis],\ + 'v1' : v1[:,:,np.newaxis],\ + 'temp' : temp[:,np.newaxis,:]}).reshape(nssrc,ntime,nbl,nchan) + + sersic_deriv = np.empty((3,nssrc,ntime,nbl,nchan)) + + # partial derivatives with respect to e1 and e2 + sersic_deriv[0] = ne.evaluate('de1*temp/(sfactor*e12)', + { + 'de1' : de1[:,:,:,np.newaxis],\ + 'temp': temp[:,np.newaxis,np.newaxis,:],\ + 'e12' : e12[:,np.newaxis,np.newaxis,np.newaxis],\ + 'sfactor': sfactor }).reshape(nssrc,ntime,nbl,nchan) + + sersic_deriv[1] = ne.evaluate('de2*temp/(sfactor*e12)', + { + 'de2' : de2[:,:,:,np.newaxis],\ + 'temp': temp[:,np.newaxis,np.newaxis,:], \ + 'e12' : e12[:,np.newaxis,np.newaxis,np.newaxis],\ + 'sfactor': sfactor }).reshape(nssrc,ntime,nbl,nchan) + + # partial derivatives with respect to scalelength + sersic_deriv[2] = ne.evaluate('(sfactor-1)/(sfactor*R)', + { + 'R' : R[:,np.newaxis,np.newaxis,np.newaxis],\ + 'sfactor' : sfactor }).reshape(nssrc,ntime,nbl,nchan) + + return sersic_deriv + + + + + def compute_gekb_vis_grad(self, ekb_jones=None): + """ + Computes the gradient of the complex visibilities with respect to the sersic parameters. + Returns a (3,nssrc,ntime,nbl,nchan,4) matrix of complex scalars. + """ + + nssrc, npsrc, ngsrc, nsrc, ntime, nbl, nchan = self.dim_local_size('nssrc', 'npsrc', 'ngsrc', 'nsrc', 'ntime', 'nbl', 'nchan') + + if ekb_jones is None: + ekb_jones = self.compute_ekb_jones_per_bl() + + want_shape = (nsrc, ntime, nbl, nchan, 4) + assert ekb_jones.shape == want_shape, \ + 'Expected shape %s. Got %s instead.' % \ + (want_shape, ekb_jones.shape) + + ant0, ant1 = self.ap_idx(chan=True) + g_term_p = self.G_term[ant0] + g_term_q = self.G_term[ant1] + + assert g_term_p.shape == (ntime, nbl, nchan, 4) + + gekb_jones = np.empty([nssrc,ntime,nbl,nchan,4]) + for i in range(nssrc): + gekb_jones[i] = (self.jones_multiply(g_term_p, ekb_jones[i]) + .reshape(ntime, nbl, nchan, 4)) + gekb_jones[i] = (self.jones_multiply(gekb_jones[i], g_term_q, hermitian=True) + .reshape(ntime, nbl, nchan, 4)) + + sersic_deriv = self.compute_sersic_derivatives() + + src_beg = npsrc + ngsrc + src_end = npsrc + ngsrc + nssrc + vis_grad = np.empty((3,nssrc,ntime,nbl,nchan,4),dtype = np.complex) + vis_grad[0] = gekb_jones[src_beg:src_end]*sersic_deriv[0,:,:,:,:,np.newaxis] + vis_grad[1] = gekb_jones[src_beg:src_end]*sersic_deriv[1,:,:,:,:,np.newaxis] + vis_grad[2] = gekb_jones[src_beg:src_end]*sersic_deriv[2,:,:,:,:,np.newaxis] + + assert vis_grad.shape == (3, nssrc, ntime, nbl, nchan, 4) + + return vis_grad + + def compute_chi_sqrd_sum_terms(self, vis=None): """ Computes the terms of the chi squared sum, @@ -711,14 +849,72 @@ def compute_chi_sqrd(self, chi_sqrd_terms=None): return (term_sum if self.use_weight_vector() is True else term_sum / self.sigma_sqrd) + + def compute_chi_sqrd_gradient(self, vis=None, vis_grad=None): + """ Computes the chi squared gradient with respect to Sersic parameters. + Parameters: + weight_vector : boolean + True if the chi squared test + should be computed with a noise vector + Returns a (3,nssrc) matrix of real values + """ + nssrc, ntime, nbl, nchan = self.dim_local_size('nssrc', 'ntime', 'nbl', 'nchan') + + # Do the chi squared gradient on the CPU. + # If we're not using the weight vector, sum and + # divide by the sigma squared. + # Otherwise, simply return the sum + if vis is None: vis = self.compute_gekb_vis() + if vis_grad is None: vis_grad = self.compute_gekb_vis_grad() + + + # Compute the residuals if this has not yet happened + if not self.outputs_residuals(): + d = ne.evaluate('(ovis - mvis)*where(flag > 0, 0, 1)', { + 'mvis': vis, + 'ovis': self.observed_vis, + 'flag' : self.flag }) + assert d.shape == (ntime, nbl, nchan, 4) + else: + d = vis + + re = d.real + im = d.imag + wv = self.weight_vector + + # Multiply by the weight vector if required + if self.use_weight_vector() is True: + ne.evaluate('re*wv', {'re': re, 'wv': wv}, out=re) + ne.evaluate('im*wv', {'im': im, 'wv': wv}, out=im) + + # Multiply by the visibilities gradient + for p in xrange(3): + for s in xrange(nssrc): + vis_grad[p,s].real = ne.evaluate('re*dre', {'re': re, 'dre': vis_grad[p,s].real}) + vis_grad[p,s].imag = ne.evaluate('im*dim', {'im': im, 'dim': vis_grad[p,s].imag}) + + # Sum the real and imaginary terms together + # for the final result. + re_sum = ne.evaluate('sum(re,5)', {'re': vis_grad.real}) + im_sum = ne.evaluate('sum(im,5)', {'im': vis_grad.imag}) + chi_sqrd_grad_terms = ne.evaluate('re_sum + im_sum', + {'re_sum': re_sum, 'im_sum': im_sum}) + assert chi_sqrd_grad_terms.shape == (3, nssrc, ntime, nbl, nchan) + + self.X2_grad = 6*np.sum(chi_sqrd_grad_terms,(2,3,4)).reshape(3,nssrc) + + return (self.X2_grad if self.use_weight_vector() is True \ + else self.X2_grad / self.sigma_sqrd) + + def solve(self): """ Solve the RIME """ self.jones[:] = self.compute_ekb_sqrt_jones_per_ant() - self.vis[:] = self.compute_gekb_vis() + self.model_vis[:] = self.compute_gekb_vis() self.chi_sqrd_result[:] = self.compute_chi_sqrd_sum_terms( - self.vis) + self.model_vis) - self.set_X2(self.compute_chi_sqrd(self.chi_sqrd_result)) \ No newline at end of file + self.set_X2(self.compute_chi_sqrd(self.chi_sqrd_result)) From 667aec15154f2f77ada0051d004a5db3e2f540a1 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Fri, 15 Jul 2016 14:11:50 +0100 Subject: [PATCH 03/33] added chi squared gradient option --- montblanc/impl/rime/v4/RimeSolver.py | 18 ++++++++++++++---- montblanc/impl/rime/v4/config.py | 2 ++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/montblanc/impl/rime/v4/RimeSolver.py b/montblanc/impl/rime/v4/RimeSolver.py index 895e7f30e..fc3d519ad 100644 --- a/montblanc/impl/rime/v4/RimeSolver.py +++ b/montblanc/impl/rime/v4/RimeSolver.py @@ -30,14 +30,24 @@ from montblanc.impl.rime.v4.gpu.RimeBSqrt import RimeBSqrt from montblanc.impl.rime.v4.gpu.RimeEKBSqrt import RimeEKBSqrt from montblanc.impl.rime.v4.gpu.RimeSumCoherencies import RimeSumCoherencies +from montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient import SersicChiSquaredGradient from montblanc.pipeline import Pipeline def get_pipeline(slvr_cfg): - return Pipeline([RimeBSqrt(), - RimeEBeam(), - RimeEKBSqrt(), - RimeSumCoherencies()]) + sersic_grad = slvr_cfg[Options.SERSIC_GRADIENT] + + if (sersic_grad): + return Pipeline([RimeBSqrt(), + RimeEBeam(), + RimeEKBSqrt(), + RimeSumCoherencies(), + SersicChiSquaredGradient()]) + else: + return Pipeline([RimeBSqrt(), + RimeEBeam(), + RimeEKBSqrt(), + RimeSumCoherencies()]) class RimeSolver(MontblancCUDASolver): """ RIME Solver Implementation """ diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py index 95f13a970..158a93fe0 100644 --- a/montblanc/impl/rime/v4/config.py +++ b/montblanc/impl/rime/v4/config.py @@ -235,4 +235,6 @@ class Classifier(Enum): classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])), ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft', classifiers=frozenset([Classifier.GPU_SCRATCH])), + ary_dict('X2_grad', (3,'nssrc',), 'ft', + classifiers=frozenset([Classifier.GPU_SCRATCH])), ] From ddc2f0a1a309de48405277ca70c0d75827ea756a Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Fri, 15 Jul 2016 14:12:35 +0100 Subject: [PATCH 04/33] added chi squared gradient option --- montblanc/impl/rime/slvr_config.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/montblanc/impl/rime/slvr_config.py b/montblanc/impl/rime/slvr_config.py index d5bae57b7..556534ed4 100644 --- a/montblanc/impl/rime/slvr_config.py +++ b/montblanc/impl/rime/slvr_config.py @@ -63,6 +63,14 @@ class RimeSolverConfig(SolverConfig): "If True, chi-squared terms are weighted with a vectorised sigma. " "If False, chi-squared terms are weighted with a single scalar sigma.") + # Enabling chi squared gradient computation with respect Sersic parameters + SERSIC_GRADIENT = 'sersic_gradient' + DEFAULT_SERSIC_GRADIENT = False + VALID_SERSIC_GRADIENT = [True, False] + SERSIC_GRADIENT_DESCRIPTION = ( + "If True, chi_squared gradient is computed with respect to the sersic parameters for each source.") + + # weight vector initialisation keyword and valid values # This SolverConfig determines whether INIT_WEIGHTS = 'init_weights' @@ -198,6 +206,13 @@ class RimeSolverConfig(SolverConfig): SolverConfig.DEFAULT: DEFAULT_E_BEAM_DEPTH, SolverConfig.REQUIRED: True }, + + SERSIC_GRADIENT: { + SolverConfig.DESCRIPTION: SERSIC_GRADIENT_DESCRIPTION, + SolverConfig.VALID: VALID_SERSIC_GRADIENT, + SolverConfig.DEFAULT: DEFAULT_SERSIC_GRADIENT, + SolverConfig.REQUIRED: True + }, } def parser(self): @@ -221,6 +236,12 @@ def parser(self): help=self.E_BEAM_DEPTH_DESCRIPTION, default=self.DEFAULT_E_BEAM_DEPTH) + p.add_argument('--{v}'.format(v=self.SERSIC_GRADIENT), + required=False, + type=bool, + help=self.SERSIC_GRADIENT_DESCRIPTION, + default=self.DEFAULT_SERSIC_GRADIENT) + p.add_argument('--{v}'.format(v=self.WEIGHT_VECTOR), required=False, type=bool, @@ -274,4 +295,4 @@ def parser(self): help=self.VERSION_DESCRIPTION, default=self.DEFAULT_VERSION) - return p \ No newline at end of file + return p From 21dc3af4d8a6a42dec1a44a2ddcbed1a14afe02a Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Mon, 18 Jul 2016 18:05:40 +0100 Subject: [PATCH 05/33] Sersic gradient example --- montblanc/examples/MS_gradient_example.py | 140 ++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 montblanc/examples/MS_gradient_example.py diff --git a/montblanc/examples/MS_gradient_example.py b/montblanc/examples/MS_gradient_example.py new file mode 100644 index 000000000..c1f51f8dc --- /dev/null +++ b/montblanc/examples/MS_gradient_example.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2016 Marzia Rivi +# +# This file is part of montblanc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . + +# Compare analytical chi squared gradient computation with respect to +# Sersic parameters with the numerical gradent computation. +# CPU and GPU solvers are also compared + +# Input: filename - Measurement Set file +# ns - number of sersic sources + +import sys +import argparse +import math +import numpy as np +import montblanc +import montblanc.util as mbu + +from montblanc.config import RimeSolverConfig as Options + +ARCS2RAD = np.pi/648000. + +#scalelength in arcsec +minscale = 0.3*ARCS2RAD +maxscale = 3.5*ARCS2RAD + +parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT') +parser.add_argument('msfile', help='Input MS filename') +parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies') + +args = parser.parse_args(sys.argv[1:]) + +# Get the RIME solver +# Enable gradient computation: sersic_gradient=True +slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile, + sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc), + init_weights=None, weight_vector=False, + sersic_gradient=True, dtype='double', version='v4') + +with montblanc.rime_solver(slvr_cfg) as slvr: + + nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan') + +# Random source coordinates in the l,m (brightness image) domain + l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) + m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) + lm=mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype) + slvr.transfer_lm(lm) + +# Brightness matrix for sources + stokes = np.empty(shape=slvr.stokes.shape, dtype=slvr.stokes.dtype) + I, Q, U, V = stokes[:,:,0], stokes[:,:,1], stokes[:,:,2], stokes[:,:,3] + I[:] = np.ones(shape=I.shape)*70. + Q[:] = np.zeros(shape=Q.shape) + U[:] = np.zeros(shape=U.shape) + V[:] = np.zeros(shape=V.shape) + slvr.transfer_stokes(stokes) + alpha = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime) + slvr.transfer_alpha(alpha) + + # If there are sersic sources, create their + # shape matrix and transfer it. + mod = slvr.ft(np.random.random(nssrc))*0.3 + angle = slvr.ft(np.random.random(nssrc))*2*np.pi + e1 = mod*np.sin(angle) + e2 = mod*np.cos(angle) + R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale + sersic_shape = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc)) + slvr.transfer_sersic_shape(sersic_shape) + print sersic_shape + + #Set visibility noise variance (muJy) + time_acc = 60 + efficiency = 0.9 + channel_bandwidth_hz = 20e6 + SEFD = 400e6 + sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency) + slvr.set_sigma_sqrd(sigma) + + + # Create observed data and upload it to the GPU + slvr.solve() + with slvr.context: + observed = slvr.retrieve_model_vis() + + noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape) + noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape) + noise = noiseR +1j*noiseI + observed = observed+noise + print 'transfer to GPU' + slvr.transfer_observed_vis(observed) + + print slvr + + slvr.solve() + dq_gpu = slvr.X2_grad + print "analytical GPU: ",dq_gpu + + # Execute the pipeline + slvr.solve() + l1 = slvr.X2 + dq = np.empty([3,nssrc]) + for i in xrange(nssrc): + e1_inc = sersic_shape.copy() + e1_inc[0,i] = e1_inc[0,i]+0.000001 + slvr.transfer_sersic_shape(e1_inc) + slvr.solve() + l2 = slvr.X2 + dq[0,i] = (l2-l1)*1e6 + e2_inc = sersic_shape.copy() + e2_inc[1,i] = e2_inc[1,i]+ 0.000001 + slvr.transfer_sersic_shape(e2_inc) + slvr.solve() + l2 = slvr.X2 + dq[1,i] = (l2-l1)*1e6 + R_inc = sersic_shape.copy() + R_inc[2,i] = R_inc[2,i]+0.00000001 + slvr.transfer_sersic_shape(R_inc) + slvr.solve() + l2 = slvr.X2 + dq[2,i] = (l2-l1)*1e8 + print "numeric",dq + + From 05433fdb66d980520b42879b08f9b6450d6600c4 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Mon, 18 Jul 2016 18:06:42 +0100 Subject: [PATCH 06/33] Sersic gradient example --- montblanc/examples/MS_gradient_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/montblanc/examples/MS_gradient_example.py b/montblanc/examples/MS_gradient_example.py index c1f51f8dc..041405521 100644 --- a/montblanc/examples/MS_gradient_example.py +++ b/montblanc/examples/MS_gradient_example.py @@ -20,7 +20,6 @@ # Compare analytical chi squared gradient computation with respect to # Sersic parameters with the numerical gradent computation. -# CPU and GPU solvers are also compared # Input: filename - Measurement Set file # ns - number of sersic sources From 7541c011557a6b998963fac3b962aba228151430 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 20 Jul 2016 16:56:39 +0100 Subject: [PATCH 07/33] uncommented multiplication by the G term --- montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 8e6bafa40..b017cb502 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -277,7 +277,7 @@ class SumCohTraits { dev_vis[1].y = ant_one.y*dev_e2; dev_vis[2].x = ant_one.x*dev_scale; dev_vis[2].y = ant_one.y*dev_scale; - /* + for (int p = 0; p < 3; p++) { // Multiply the visibility derivative by antenna 1's g term @@ -291,7 +291,7 @@ class SumCohTraits { montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant1_g_term, ant2_g_term); dev_vis[p] = ant1_g_term; } - */ + // Write partial derivative with respect to sersic parameters for (int p=0; p<3; p++) From b59b4e0af2abf44fb2ad8144b553cb8c53795d80 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Thu, 28 Jul 2016 16:52:10 +0100 Subject: [PATCH 08/33] added minor optimisations --- .../rime/v4/gpu/SersicChiSquaredGradient.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index b017cb502..1e927e19c 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -222,11 +222,9 @@ class SumCohTraits { i = SRC - SRC_START; shared.e1 = sersic_shape[i]; i += DEXT(nssrc); shared.e2 = sersic_shape[i]; i += DEXT(nssrc); shared.sersic_scale = sersic_shape[i]; - - shared.X2_grad_part[0] = 0.; - shared.X2_grad_part[1] = 0.; - shared.X2_grad_part[2] = 0.; } + if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3) + shared.X2_grad_part[threadIdx.z] = 0.; __syncthreads(); @@ -277,7 +275,7 @@ class SumCohTraits { dev_vis[1].y = ant_one.y*dev_e2; dev_vis[2].x = ant_one.x*dev_scale; dev_vis[2].y = ant_one.y*dev_scale; - + /* for (int p = 0; p < 3; p++) { // Multiply the visibility derivative by antenna 1's g term @@ -292,7 +290,7 @@ class SumCohTraits { dev_vis[p] = ant1_g_term; } - + */ // Write partial derivative with respect to sersic parameters for (int p=0; p<3; p++) { @@ -323,14 +321,11 @@ class SumCohTraits { __syncthreads(); //atomic addition to avoid concurrent access in the device memory (contribution for a single particle) - if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) + // 3 different threads writes each a different component to avoid serialisation + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3) { - i = SRC - DEXT(npsrc) - DEXT(ngsrc); - atomicAdd(&(X2_grad[i]), shared.X2_grad_part[0]); - i += DEXT(nssrc); - atomicAdd(&(X2_grad[i]), shared.X2_grad_part[1]); - i += DEXT(nssrc); - atomicAdd(&(X2_grad[i]), shared.X2_grad_part[2]); + i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.z*DEXT(nssrc); + atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.z]); } } } @@ -378,7 +373,7 @@ def __init__(self, weight_vector=False): def initialise(self, solver, stream=None): slvr = solver - ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') + nssrc, ntime, nbl, npolchan = slvr.dim_local_size('nssrc','ntime', 'nbl', 'npolchan') # Get a property dictionary off the solver D = slvr.template_dict() @@ -410,6 +405,11 @@ def initialise(self, solver, stream=None): include_dirs=[montblanc.get_source_path()], no_extern_c=True) + if slvr.is_float(): + self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32) + else: + self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64) + self.rime_const_data_gpu = self.mod.get_global('C') self.kernel = self.mod.get_function(kname) self.launch_params = self.get_launch_params(slvr, D) @@ -451,20 +451,17 @@ def execute(self, solver, stream=None): sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ else slvr.sersic_shape - - if slvr.is_float(): - gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32) - else: - gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64) + + self.gradient.fill(0.) self.kernel(slvr.uvw, sersic, slvr.frequency, slvr.antenna1, slvr.antenna2, slvr.jones, slvr.flag, slvr.weight_vector, slvr.observed_vis, slvr.G_term, - slvr.model_vis, gradient, + slvr.model_vis, self.gradient, stream=stream, **self.launch_params) - slvr.X2_grad = 6*gradient.get().reshape(3,nssrc) + slvr.X2_grad = 6*((self.gradient).get().reshape(3,nssrc)) if not self.weight_vector: slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd From 6ad91b90c0b1704d19a071a29bb3f1cd01abd3b6 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Mon, 1 Aug 2016 11:56:39 +0100 Subject: [PATCH 09/33] move gradient elements product by 6 in the kernel --- montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 1e927e19c..a4bb3b007 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -315,6 +315,7 @@ class SumCohTraits { //atomic addition to avoid concurrent access in the shared memory dev_vis[p].x += dev_vis[p].y; + dev_vis[p].x *= 6; atomicAdd(&(shared.X2_grad_part[p]), dev_vis[p].x); } } @@ -322,10 +323,10 @@ class SumCohTraits { //atomic addition to avoid concurrent access in the device memory (contribution for a single particle) // 3 different threads writes each a different component to avoid serialisation - if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3) + if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) { - i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.z*DEXT(nssrc); - atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.z]); + i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.x*DEXT(nssrc); + atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]); } } } @@ -461,7 +462,7 @@ def execute(self, solver, stream=None): slvr.model_vis, self.gradient, stream=stream, **self.launch_params) - slvr.X2_grad = 6*((self.gradient).get().reshape(3,nssrc)) + slvr.X2_grad = (self.gradient).get().reshape(3,nssrc) if not self.weight_vector: slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd From d083bd57d53ca550342d7ac6003b32986ff9f817 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Mon, 1 Aug 2016 14:47:50 +0100 Subject: [PATCH 10/33] changed X2_grad classifier, now SIMULATOR_OUTPUT --- montblanc/impl/rime/v4/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py index 158a93fe0..edf14a349 100644 --- a/montblanc/impl/rime/v4/config.py +++ b/montblanc/impl/rime/v4/config.py @@ -236,5 +236,5 @@ class Classifier(Enum): ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft', classifiers=frozenset([Classifier.GPU_SCRATCH])), ary_dict('X2_grad', (3,'nssrc',), 'ft', - classifiers=frozenset([Classifier.GPU_SCRATCH])), + classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])), ] From 32734c20ddb47c13ab22e409669d1660f4c392a8 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Mon, 1 Aug 2016 15:46:39 +0100 Subject: [PATCH 11/33] minor changes --- montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index a4bb3b007..be4ae2fd2 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -407,9 +407,9 @@ def initialise(self, solver, stream=None): no_extern_c=True) if slvr.is_float(): - self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32) + self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float32) else: - self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64) + self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64) self.rime_const_data_gpu = self.mod.get_global('C') self.kernel = self.mod.get_function(kname) @@ -438,7 +438,6 @@ def get_launch_params(self, slvr, D): def execute(self, solver, stream=None): slvr = solver - nssrc = slvr.dim_local_size('nssrc') if stream is not None: cuda.memcpy_htod_async( @@ -462,7 +461,7 @@ def execute(self, solver, stream=None): slvr.model_vis, self.gradient, stream=stream, **self.launch_params) - slvr.X2_grad = (self.gradient).get().reshape(3,nssrc) + slvr.X2_grad = self.gradient.get() if not self.weight_vector: slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd From 30f1001f85331b6f6b366cd72a7add373b8954d4 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Tue, 2 Aug 2016 12:00:03 +0100 Subject: [PATCH 12/33] minor changes --- .../MS_single_gpu_gradient_example.py | 134 ++++++++++++++++++ .../rime/v4/gpu/SersicChiSquaredGradient.py | 6 +- 2 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 montblanc/examples/MS_single_gpu_gradient_example.py diff --git a/montblanc/examples/MS_single_gpu_gradient_example.py b/montblanc/examples/MS_single_gpu_gradient_example.py new file mode 100644 index 000000000..fe3a61e4d --- /dev/null +++ b/montblanc/examples/MS_single_gpu_gradient_example.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2016 Marzia Rivi +# +# This file is part of montblanc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . + +# Compare analytical chi squared gradient computation with respect to +# Sersic parameters with the numerical gradent computation. + +# Input: filename - Measurement Set file +# ns - number of sersic sources + +import sys +import argparse +import math +import numpy as np +import montblanc +import montblanc.util as mbu + +from montblanc.config import RimeSolverConfig as Options + +ARCS2RAD = np.pi/648000. + +#scalelength in arcsec +minscale = 0.3*ARCS2RAD +maxscale = 3.5*ARCS2RAD + +parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT') +parser.add_argument('msfile', help='Input MS filename') +parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies') + +args = parser.parse_args(sys.argv[1:]) + +# Get the RIME solver +# Enable gradient computation: sersic_gradient=True +slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile, + sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc), + init_weights=None, weight_vector=False, + sersic_gradient=True, dtype='double', version='v4') + +with montblanc.rime_solver(slvr_cfg) as slvr: + + nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan') + np.random.seed(14352) + + # Random source coordinates in the l,m (brightness image) domain + l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) + m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) + lm=mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype) + slvr.transfer_lm(lm) + + # Brightness matrix for sources + stokes = np.empty(shape=slvr.stokes.shape, dtype=slvr.stokes.dtype) + I, Q, U, V = stokes[:,:,0], stokes[:,:,1], stokes[:,:,2], stokes[:,:,3] + I[:] = np.ones(shape=I.shape)*70. + Q[:] = np.zeros(shape=Q.shape) + U[:] = np.zeros(shape=U.shape) + V[:] = np.zeros(shape=V.shape) + slvr.transfer_stokes(stokes) + alpha = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime) + slvr.transfer_alpha(alpha) + + # If there are sersic sources, create their + # shape matrix and transfer it. + mod = slvr.ft(np.random.random(nssrc))*0.3 + angle = slvr.ft(np.random.random(nssrc))*2*np.pi + e1 = mod*np.sin(angle) + e2 = mod*np.cos(angle) + R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale + sersic_shape = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc)) + slvr.transfer_sersic_shape(sersic_shape) + print sersic_shape + + #Set visibility noise variance (muJy) + time_acc = 60 + efficiency = 0.9 + channel_bandwidth_hz = 20e6 + SEFD = 400e6 + sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency) + slvr.set_sigma_sqrd(sigma) + + # Create observed data and upload it to the GPU + slvr.solve() + with slvr.context: + observed = slvr.retrieve_model_vis() + + noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape) + noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape) + noise = noiseR +1j*noiseI + observed = observed+noise + slvr.transfer_observed_vis(observed) + + slvr.solve() + + print slvr + print "analytical: ",slvr.X2_grad + + # Execute the pipeline + l1 = slvr.X2 + dq = np.empty([3,nssrc]) + for i in xrange(nssrc): + e1_inc = sersic_shape.copy() + e1_inc[0,i] = e1_inc[0,i]+0.000001 + slvr.transfer_sersic_shape(e1_inc) + slvr.solve() + l2 = slvr.X2 + dq[0,i] = (l2-l1)*1e6 + e2_inc = sersic_shape.copy() + e2_inc[1,i] = e2_inc[1,i]+ 0.000001 + slvr.transfer_sersic_shape(e2_inc) + slvr.solve() + l2 = slvr.X2 + dq[1,i] = (l2-l1)*1e6 + R_inc = sersic_shape.copy() + R_inc[2,i] = R_inc[2,i]+0.00000001 + slvr.transfer_sersic_shape(R_inc) + slvr.solve() + l2 = slvr.X2 + dq[2,i] = (l2-l1)*1e8 + print "numeric: ",dq diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index be4ae2fd2..33e9061be 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -411,7 +411,7 @@ def initialise(self, solver, stream=None): else: self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64) - self.rime_const_data_gpu = self.mod.get_global('C') + self.rime_const_data = self.mod.get_global('C') self.kernel = self.mod.get_function(kname) self.launch_params = self.get_launch_params(slvr, D) @@ -441,12 +441,12 @@ def execute(self, solver, stream=None): if stream is not None: cuda.memcpy_htod_async( - self.rime_const_data_gpu[0], + self.rime_const_data[0], slvr.const_data().ndary(), stream=stream) else: cuda.memcpy_htod( - self.rime_const_data_gpu[0], + self.rime_const_data[0], slvr.const_data().ndary()) sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ From e0d046700614badded753bbb9f19a082e4f8f004 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Tue, 2 Aug 2016 14:46:16 +0100 Subject: [PATCH 13/33] fixed bug #152 as in the master merge #156 --- montblanc/examples/MS_gradient_example.py | 139 ------------------ montblanc/impl/rime/v5/CompositeRimeSolver.py | 96 ++++++++++-- 2 files changed, 80 insertions(+), 155 deletions(-) delete mode 100644 montblanc/examples/MS_gradient_example.py diff --git a/montblanc/examples/MS_gradient_example.py b/montblanc/examples/MS_gradient_example.py deleted file mode 100644 index 041405521..000000000 --- a/montblanc/examples/MS_gradient_example.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2016 Marzia Rivi -# -# This file is part of montblanc. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see . - -# Compare analytical chi squared gradient computation with respect to -# Sersic parameters with the numerical gradent computation. - -# Input: filename - Measurement Set file -# ns - number of sersic sources - -import sys -import argparse -import math -import numpy as np -import montblanc -import montblanc.util as mbu - -from montblanc.config import RimeSolverConfig as Options - -ARCS2RAD = np.pi/648000. - -#scalelength in arcsec -minscale = 0.3*ARCS2RAD -maxscale = 3.5*ARCS2RAD - -parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT') -parser.add_argument('msfile', help='Input MS filename') -parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies') - -args = parser.parse_args(sys.argv[1:]) - -# Get the RIME solver -# Enable gradient computation: sersic_gradient=True -slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile, - sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc), - init_weights=None, weight_vector=False, - sersic_gradient=True, dtype='double', version='v4') - -with montblanc.rime_solver(slvr_cfg) as slvr: - - nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan') - -# Random source coordinates in the l,m (brightness image) domain - l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) - m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) - lm=mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype) - slvr.transfer_lm(lm) - -# Brightness matrix for sources - stokes = np.empty(shape=slvr.stokes.shape, dtype=slvr.stokes.dtype) - I, Q, U, V = stokes[:,:,0], stokes[:,:,1], stokes[:,:,2], stokes[:,:,3] - I[:] = np.ones(shape=I.shape)*70. - Q[:] = np.zeros(shape=Q.shape) - U[:] = np.zeros(shape=U.shape) - V[:] = np.zeros(shape=V.shape) - slvr.transfer_stokes(stokes) - alpha = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime) - slvr.transfer_alpha(alpha) - - # If there are sersic sources, create their - # shape matrix and transfer it. - mod = slvr.ft(np.random.random(nssrc))*0.3 - angle = slvr.ft(np.random.random(nssrc))*2*np.pi - e1 = mod*np.sin(angle) - e2 = mod*np.cos(angle) - R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale - sersic_shape = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc)) - slvr.transfer_sersic_shape(sersic_shape) - print sersic_shape - - #Set visibility noise variance (muJy) - time_acc = 60 - efficiency = 0.9 - channel_bandwidth_hz = 20e6 - SEFD = 400e6 - sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency) - slvr.set_sigma_sqrd(sigma) - - - # Create observed data and upload it to the GPU - slvr.solve() - with slvr.context: - observed = slvr.retrieve_model_vis() - - noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape) - noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape) - noise = noiseR +1j*noiseI - observed = observed+noise - print 'transfer to GPU' - slvr.transfer_observed_vis(observed) - - print slvr - - slvr.solve() - dq_gpu = slvr.X2_grad - print "analytical GPU: ",dq_gpu - - # Execute the pipeline - slvr.solve() - l1 = slvr.X2 - dq = np.empty([3,nssrc]) - for i in xrange(nssrc): - e1_inc = sersic_shape.copy() - e1_inc[0,i] = e1_inc[0,i]+0.000001 - slvr.transfer_sersic_shape(e1_inc) - slvr.solve() - l2 = slvr.X2 - dq[0,i] = (l2-l1)*1e6 - e2_inc = sersic_shape.copy() - e2_inc[1,i] = e2_inc[1,i]+ 0.000001 - slvr.transfer_sersic_shape(e2_inc) - slvr.solve() - l2 = slvr.X2 - dq[1,i] = (l2-l1)*1e6 - R_inc = sersic_shape.copy() - R_inc[2,i] = R_inc[2,i]+0.00000001 - slvr.transfer_sersic_shape(R_inc) - slvr.solve() - l2 = slvr.X2 - dq[2,i] = (l2-l1)*1e8 - print "numeric",dq - - diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index 3ff158978..f859d5c8c 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -126,7 +126,9 @@ def __init__(self, slvr_cfg): # before throttling is applied self.throttle_factor = slvr_cfg.get( Options.VISIBILITY_THROTTLE_FACTOR) - + # Enabling chi squared gradient computation + self.enable_sersic_grad = slvr_cfg.get(Options.SERSIC_GRADIENT) + # Massage the contexts for each device into a list if not isinstance(self.dev_ctxs, list): self.dev_ctxs = [self.dev_ctxs] @@ -926,6 +928,31 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): subslvr, kernel.rime_const_data[0])) kernel.execute(subslvr, subslvr.stream) + # Iterate again for gradient computation requiring visibilities computed + # for all source chuncks + if self.enable_sersic_grad: + for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices(): + # Update our maps with source slice information + cpu_slice_map.update(src_cpu_slice_map) + gpu_slice_map.update(src_gpu_slice_map) + + # Configure dimension extents and global size on the sub-solver + for name, slice_ in cpu_slice_map.iteritems(): + subslvr.update_dimension(name=name, + global_size=self.dimension(name).global_size, + lower_extent=slice_.start, + upper_extent=slice_.stop) + + # Enqueue Sersic gradient + kernel = subslvr.rime_sersic_gradient + pool_refs.extend(self._enqueue_array( + subslvr, cpu_slice_map, gpu_slice_map, + direction=ASYNC_HTOD, dirty=dirty, + classifiers=[Classifier.COHERENCIES_INPUT])) + pool_refs.append(self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0])) + kernel.execute(subslvr, subslvr.stream) + # Enqueue chi-squared term reduction and return the # GPU array allocated to it X2_gpu_ary = subslvr.rime_reduce.execute(subslvr, subslvr.stream) @@ -943,8 +970,13 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): classifiers=[Classifier.SIMULATOR_OUTPUT]) # Should only be model visibilities - assert len(refs) == 1, ('Expected one array (model visibilities), ' - 'received {l} instead.'.format(l=len(refs))) + if self.enable_sersic_grad: + assert len(refs) == 2, ('Expected two arrays (model visibilities, X2_grad), ' + 'received {l} instead.'.format(l=len(refs))) + X2_grad = refs[1] + else: + assert len(refs) == 1, ('Expected one array (model visibilities), ' + 'received {l} instead.'.format(l=len(refs))) model_vis = refs[0] @@ -959,11 +991,17 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): pool_refs.append(X2_gpu_ary) pool_refs.append(sub_X2) pool_refs.append(model_vis) - - return (sync_event, sub_X2, model_vis, - pool_refs, subslvr.pool_lock, - cpu_slice_map.copy(), - gpu_slice_map.copy()) + if self.enable_sersic_grad: + pool_refs.append(X2_grad) + return (sync_event, sub_X2, model_vis, X2_grad, + pool_refs, subslvr.pool_lock, + cpu_slice_map.copy(), + gpu_slice_map.copy()) + else: + return (sync_event, sub_X2, model_vis, + pool_refs, subslvr.pool_lock, + cpu_slice_map.copy(), + gpu_slice_map.copy()) def initialise(self): """ Initialise the sub-solver """ @@ -1009,8 +1047,12 @@ def _sync_wait(future): Return a copy of the pinned chi-squared, pinned model visibilities and index into the CPU array after synchronizing on the cuda_event """ - cuda_event, pinned_X2, pinned_model_vis, \ - pool_refs, pool_lock, cpu, gpu = future.result() + if self.enable_sersic_grad: + cuda_event, pinned_X2, pinned_model_vis, pinned_X2_grad,\ + pool_refs, pool_lock, cpu, gpu = future.result() + else: + cuda_event, pinned_X2, pinned_model_vis, \ + pool_refs, pool_lock, cpu, gpu = future.result() try: cuda_event.synchronize() @@ -1031,12 +1073,22 @@ def _sync_wait(future): model_vis_idx = [cpu[s] if s in cpu else ALL_SLICE for s in model_vis_sshape] + # Infer proper model visibility shape, this may + # have been squeezed when enqueueing the slice + model_vis_shape = tuple(cpu[s].stop - cpu[s].start + if s in cpu else s for s in model_vis_sshape) + X2 = pinned_X2.copy() - model_vis = pinned_model_vis.copy() + model_vis = pinned_model_vis.copy().reshape(model_vis_shape) + if self.enable_sersic_grad: + X2_grad = pinned_X2_grad.copy() _free_pool_allocs(pool_refs, pool_lock) - return X2, model_vis, model_vis_idx + if self.enable_sersic_grad: + return X2, model_vis, model_vis_idx, X2_grad + else: + return X2, model_vis, model_vis_idx # For easier typing C = CompositeRimeSolver @@ -1048,6 +1100,9 @@ def _sync_wait(future): # Running sum of the chi-squared values returned in futures X2_sum = self.ft(0.0) + if self.enable_sersic_grad: + nssrc = self.dim_local_size('nssrc') + self.X2_grad = self.ft(np.zeros((3,nssrc))) # Sets of return value futures for each executor value_futures = [set() for ex in self.enqueue_executors] @@ -1118,8 +1173,12 @@ def _accumulate(model_slice, model_chunk): for s in value_futures: s.discard(f) - # Get chi-squared and model visibilities - X2, pinned_model_vis, model_vis_idx = f.result() + # Get chi-squared and model visibilities (and X2 gradient) + if self.enable_sersic_grad: + X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() + self.X2_grad += pinned_X2_grad + else: + X2, pinned_model_vis, model_vis_idx = f.result() # Sum X2 X2_sum += X2 # Write model visibilities to the numpy array @@ -1137,8 +1196,12 @@ def _accumulate(model_slice, model_chunk): # Sum remaining chi-squared for f in done: - # Get chi-squared and model visibilities - X2, pinned_model_vis, model_vis_idx = f.result() + # Get chi-squared and model visibilities (and X2 gradient) + if self.enable_sersic_grad: + X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() + self.X2_grad += pinned_X2_grad + else: + X2, pinned_model_vis, model_vis_idx = f.result() # Sum X2 X2_sum += X2 # Write model visibilities to the numpy array @@ -1150,6 +1213,7 @@ def _accumulate(model_slice, model_chunk): self.set_X2(X2_sum) else: self.set_X2(X2_sum/self.sigma_sqrd) + self.X2_grad = self.X2_grad/self.sigma_sqrd def shutdown(self): """ Shutdown the solver """ From 0761ef53e433c276cc674da30cdd1d16de884601 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 3 Aug 2016 16:55:59 +0100 Subject: [PATCH 14/33] created multi-gpu gradient example --- .../MS_single_node_gradient_example.py | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 montblanc/examples/MS_single_node_gradient_example.py diff --git a/montblanc/examples/MS_single_node_gradient_example.py b/montblanc/examples/MS_single_node_gradient_example.py new file mode 100644 index 000000000..41637fa76 --- /dev/null +++ b/montblanc/examples/MS_single_node_gradient_example.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2016 Marzia Rivi +# +# This file is part of montblanc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . + +# Compare analytical chi squared gradient computation with respect to +# Sersic parameters with the numerical gradent computation. + +# Input: filename - Measurement Set file +# ns - number of sersic sources + +import sys +import argparse +import math +import numpy as np +import montblanc +import montblanc.util as mbu + +from montblanc.config import RimeSolverConfig as Options + +ARCS2RAD = np.pi/648000. + +#scalelength in arcsec +minscale = 0.3*ARCS2RAD +maxscale = 3.5*ARCS2RAD + +parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT') +parser.add_argument('msfile', help='Input MS filename') +parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies') + +args = parser.parse_args(sys.argv[1:]) + +# Get the RIME solver +# Enable gradient computation: sersic_gradient=True +slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile, + sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc), + init_weights=None, weight_vector=False, + sersic_gradient=True, dtype='double', version='v5') + +with montblanc.rime_solver(slvr_cfg) as slvr: + + nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan') + np.random.seed(14352) + + # Random source coordinates in the l,m (brightness image) domain + l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) + m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) + slvr.lm[:] = mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype) + + # Brightness matrix for sources + I, Q, U, V = slvr.stokes[:,:,0], slvr.stokes[:,:,1], slvr.stokes[:,:,2], slvr.stokes[:,:,3] + I[:] = np.ones(shape=I.shape)*70. + Q[:] = np.zeros(shape=Q.shape) + U[:] = np.zeros(shape=U.shape) + V[:] = np.zeros(shape=V.shape) + slvr.alpha[:] = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime) + + # If there are sersic sources, create their + # shape matrix and transfer it. + mod = slvr.ft(np.random.random(nssrc))*0.3 + angle = slvr.ft(np.random.random(nssrc))*2*np.pi + e1 = mod*np.sin(angle) + e2 = mod*np.cos(angle) + R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale + slvr.sersic_shape[:] = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc)) + print slvr.sersic_shape + + #Set visibility noise variance (muJy) + time_acc = 60 + efficiency = 0.9 + channel_bandwidth_hz = 20e6 + SEFD = 400e6 + sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency) + slvr.set_sigma_sqrd(sigma) + + # Create observed data and upload it to the GPU + slvr.solve() + observed = slvr.model_vis[:].copy() + + noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape) + noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape) + noise = noiseR +1j*noiseI + slvr.observed_vis[:] = (observed+noise).copy() + + slvr.solve() + + print slvr + print "analytical: ",slvr.X2_grad + + # Execute the pipeline + l1 = slvr.X2 + sersic_shape = slvr.sersic_shape.copy() + dq = np.empty([3,nssrc]) + for i in xrange(nssrc): + e1_inc = sersic_shape.copy() + e1_inc[0,i] = e1_inc[0,i]+0.000001 + slvr.sersic_shape[:] = e1_inc + slvr.solve() + l2 = slvr.X2 + dq[0,i] = (l2-l1)*1e6 + e2_inc = sersic_shape.copy() + e2_inc[1,i] = e2_inc[1,i]+ 0.000001 + slvr.sersic_shape[:] = e2_inc + slvr.solve() + l2 = slvr.X2 + dq[1,i] = (l2-l1)*1e6 + R_inc = sersic_shape.copy() + R_inc[2,i] = R_inc[2,i]+0.00000001 + slvr.sersic_shape[:] = R_inc + slvr.solve() + l2 = slvr.X2 + dq[2,i] = (l2-l1)*1e8 + print "numeric: ",dq + + From 0e4e4a15356f4cdd79a84ab3e1da15e9f0ab1142 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 3 Aug 2016 17:21:02 +0100 Subject: [PATCH 15/33] add X2 gradient computation within multi-gpu solver --- montblanc/impl/rime/v5/CompositeRimeSolver.py | 29 ++++++++++--------- montblanc/impl/rime/v5/RimeSolver.py | 12 +++++++- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index f859d5c8c..46d99fcce 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -943,15 +943,15 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): lower_extent=slice_.start, upper_extent=slice_.stop) - # Enqueue Sersic gradient - kernel = subslvr.rime_sersic_gradient - pool_refs.extend(self._enqueue_array( - subslvr, cpu_slice_map, gpu_slice_map, - direction=ASYNC_HTOD, dirty=dirty, - classifiers=[Classifier.COHERENCIES_INPUT])) - pool_refs.append(self._enqueue_const_data_htod( - subslvr, kernel.rime_const_data[0])) - kernel.execute(subslvr, subslvr.stream) + # Enqueue Sersic gradient + kernel = subslvr.rime_sersic_gradient + pool_refs.extend(self._enqueue_array( + subslvr, cpu_slice_map, gpu_slice_map, + direction=ASYNC_HTOD, dirty=dirty, + classifiers=[Classifier.COHERENCIES_INPUT])) + pool_refs.append(self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0])) + kernel.execute(subslvr, subslvr.stream) # Enqueue chi-squared term reduction and return the # GPU array allocated to it @@ -1102,7 +1102,7 @@ def _sync_wait(future): X2_sum = self.ft(0.0) if self.enable_sersic_grad: nssrc = self.dim_local_size('nssrc') - self.X2_grad = self.ft(np.zeros((3,nssrc))) + X2_grad = self.ft(np.zeros((3,nssrc))) # Sets of return value futures for each executor value_futures = [set() for ex in self.enqueue_executors] @@ -1141,6 +1141,9 @@ def _accumulate(model_slice, model_chunk): if nvalues > self.throttle_factor: continue + if self.enable_sersic_grad: + self.X2_grad = self.ft(np.zeros((3,nssrc))) + # Enqueue CUDA operations for solving # this visibility chunk. enqueue_future = enq_ex.submit( @@ -1176,7 +1179,7 @@ def _accumulate(model_slice, model_chunk): # Get chi-squared and model visibilities (and X2 gradient) if self.enable_sersic_grad: X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() - self.X2_grad += pinned_X2_grad + X2_grad += pinned_X2_grad else: X2, pinned_model_vis, model_vis_idx = f.result() # Sum X2 @@ -1199,7 +1202,7 @@ def _accumulate(model_slice, model_chunk): # Get chi-squared and model visibilities (and X2 gradient) if self.enable_sersic_grad: X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() - self.X2_grad += pinned_X2_grad + X2_grad += pinned_X2_grad else: X2, pinned_model_vis, model_vis_idx = f.result() # Sum X2 @@ -1213,7 +1216,7 @@ def _accumulate(model_slice, model_chunk): self.set_X2(X2_sum) else: self.set_X2(X2_sum/self.sigma_sqrd) - self.X2_grad = self.X2_grad/self.sigma_sqrd + self.X2_grad = X2_grad/self.sigma_sqrd def shutdown(self): """ Shutdown the solver """ diff --git a/montblanc/impl/rime/v5/RimeSolver.py b/montblanc/impl/rime/v5/RimeSolver.py index 7d7192ee0..32b41d7cf 100644 --- a/montblanc/impl/rime/v5/RimeSolver.py +++ b/montblanc/impl/rime/v5/RimeSolver.py @@ -31,6 +31,7 @@ from montblanc.impl.rime.v5.gpu.RimeEKBSqrt import RimeEKBSqrt from montblanc.impl.rime.v5.gpu.RimeSumCoherencies import RimeSumCoherencies from montblanc.impl.rime.v5.gpu.RimeReduction import RimeReduction +from montblanc.impl.rime.v5.gpu.SersicChiSquaredGradient import SersicChiSquaredGradient from montblanc.impl.rime.v4.RimeSolver import RimeSolver as RimeSolverV4 @@ -64,11 +65,14 @@ def __init__(self, slvr_cfg): slvr_cfg[Options.E_BEAM_DEPTH], description='E cube nu depth') + self.enable_sersic_grad = slvr_cfg.get(Options.SERSIC_GRADIENT) + self.rime_e_beam = RimeEBeam() self.rime_b_sqrt = RimeBSqrt() self.rime_ekb_sqrt = RimeEKBSqrt() self.rime_sum = RimeSumCoherencies() self.rime_reduce = RimeReduction() + self.rime_sersic_gradient = SersicChiSquaredGradient() from montblanc.impl.rime.v4.ant_pairs import monkey_patch_antenna_pairs monkey_patch_antenna_pairs(self) @@ -136,6 +140,8 @@ def initialise(self): self.rime_ekb_sqrt.initialise(self) self.rime_sum.initialise(self) self.rime_reduce.initialise(self) + if self.enable_sersic_grad: + self.rime_sersic_gradient.initialise(self) def solve(self): with self.context: @@ -144,6 +150,8 @@ def solve(self): self.rime_ekb_sqrt.execute(self) self.rime_sum.execute(self) self.rime_reduce.execute(self) + if self.enable_sersic_grad: + self.rime_sersic_gradient.execute(self) def shutdown(self): with self.context: @@ -151,4 +159,6 @@ def shutdown(self): self.rime_b_sqrt.shutdown(self) self.rime_ekb_sqrt.shutdown(self) self.rime_sum.shutdown(self) - self.rime_reduce.shutdown(self) \ No newline at end of file + self.rime_reduce.shutdown(self) + if self.enable_sersic_grad: + self.rime_sersic_gradient.shutdown(self) From 6f7e2c0c7b15715572eb17942bc48d1513b7d847 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 3 Aug 2016 17:22:02 +0100 Subject: [PATCH 16/33] add X2 gradient computation version 5 --- .../rime/v5/gpu/SersicChiSquaredGradient.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py diff --git a/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py new file mode 100644 index 000000000..0d93b9062 --- /dev/null +++ b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2016 Marzia Rivi +# +# This file is part of montblanc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . + +import numpy as np + +import pycuda.driver as cuda +import pycuda.gpuarray as gpuarray + +import montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient + +class SersicChiSquaredGradient(montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient.SersicChiSquaredGradient): + def __init__(self): + super(SersicChiSquaredGradient, self).__init__() + def initialise(self, solver, stream=None): + super(SersicChiSquaredGradient, self).initialise(solver,stream) + def shutdown(self, solver, stream=None): + super(SersicChiSquaredGradient, self).shutdown(solver,stream) + def pre_execution(self, solver, stream=None): + super(SersicChiSquaredGradient, self).pre_execution(solver,stream) + + if stream is not None: + cuda.memcpy_htod_async( + self.rime_const_data[0], + solver.const_data().ndary(), + stream=stream) + else: + cuda.memcpy_htod( + self.rime_const_data[0], + solver.const_data().ndary()) + + def post_execution(self, solver, stream=None): + super(SersicChiSquaredGradient, self).post_execution(solver,stream) + + def execute(self, solver, stream=None): + slvr = solver + + sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ + else slvr.sersic_shape + + print slvr.X2_grad + self.kernel(slvr.uvw, sersic, + slvr.frequency, slvr.antenna1, slvr.antenna2, + slvr.jones, slvr.flag, slvr.weight_vector, + slvr.observed_vis, slvr.G_term, + slvr.model_vis, slvr.X2_grad, + stream=stream, **self.launch_params) + print slvr.X2_grad From 11b85ea258fa2dce9e4fed1850b1dbfd0707cf66 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 3 Aug 2016 20:34:27 +0100 Subject: [PATCH 17/33] removed print instructions --- montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py index 0d93b9062..70d5c0445 100644 --- a/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py @@ -54,11 +54,9 @@ def execute(self, solver, stream=None): sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ else slvr.sersic_shape - print slvr.X2_grad self.kernel(slvr.uvw, sersic, slvr.frequency, slvr.antenna1, slvr.antenna2, slvr.jones, slvr.flag, slvr.weight_vector, slvr.observed_vis, slvr.G_term, slvr.model_vis, slvr.X2_grad, stream=stream, **self.launch_params) - print slvr.X2_grad From 7a959880c0f7e8e049427ec26544dbe85d466ffa Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 3 Aug 2016 20:43:59 +0100 Subject: [PATCH 18/33] fixed misalignment --- montblanc/impl/rime/v5/CompositeRimeSolver.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index 46d99fcce..aac4ddcb4 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -935,7 +935,7 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Update our maps with source slice information cpu_slice_map.update(src_cpu_slice_map) gpu_slice_map.update(src_gpu_slice_map) - + # Configure dimension extents and global size on the sub-solver for name, slice_ in cpu_slice_map.iteritems(): subslvr.update_dimension(name=name, @@ -943,15 +943,15 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): lower_extent=slice_.start, upper_extent=slice_.stop) - # Enqueue Sersic gradient - kernel = subslvr.rime_sersic_gradient - pool_refs.extend(self._enqueue_array( - subslvr, cpu_slice_map, gpu_slice_map, - direction=ASYNC_HTOD, dirty=dirty, - classifiers=[Classifier.COHERENCIES_INPUT])) - pool_refs.append(self._enqueue_const_data_htod( - subslvr, kernel.rime_const_data[0])) - kernel.execute(subslvr, subslvr.stream) + # Enqueue Sersic gradient + kernel = subslvr.rime_sersic_gradient + pool_refs.extend(self._enqueue_array( + subslvr, cpu_slice_map, gpu_slice_map, + direction=ASYNC_HTOD, dirty=dirty, + classifiers=[Classifier.COHERENCIES_INPUT])) + pool_refs.append(self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0])) + kernel.execute(subslvr, subslvr.stream) # Enqueue chi-squared term reduction and return the # GPU array allocated to it From fc486a345892e5327a8d18b58c933685e7ff3ee1 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 17 Aug 2016 14:50:35 +0100 Subject: [PATCH 19/33] bug fixed --- montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 33e9061be..2ed9cb437 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -223,8 +223,8 @@ class SumCohTraits { i += DEXT(nssrc); shared.e2 = sersic_shape[i]; i += DEXT(nssrc); shared.sersic_scale = sersic_shape[i]; } - if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3) - shared.X2_grad_part[threadIdx.z] = 0.; + if(threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) + shared.X2_grad_part[threadIdx.x] = 0.; __syncthreads(); @@ -325,7 +325,7 @@ class SumCohTraits { // 3 different threads writes each a different component to avoid serialisation if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) { - i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.x*DEXT(nssrc); + i = SRC - SRC_START + threadIdx.x*NSSRC; atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]); } } From 4738328ab85cb77dd2fcc2bb2feffbf7b11e9498 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 17 Aug 2016 14:52:14 +0100 Subject: [PATCH 20/33] Composite solver merged with the master --- montblanc/impl/rime/v5/CompositeRimeSolver.py | 195 +++++++++++------- 1 file changed, 123 insertions(+), 72 deletions(-) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index aac4ddcb4..84abfcd80 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -18,6 +18,7 @@ # You should have received a copy of the GNU General Public License # along with this program; if not, see . +from collections import defaultdict from copy import (copy as shallowcopy, deepcopy) import functools import itertools @@ -31,6 +32,8 @@ import pycuda.driver as cuda import pycuda.tools +import hypercube.util as hcu + import montblanc import montblanc.util as mbu @@ -63,6 +66,10 @@ ORDERING_RANK = [' or '.join(['nsrc'] + mbu.source_nr_vars()), 'ntime', ' or '.join(['nbl', 'na']), 'nchan'] +def _update_refs(pool, new_refs): + for key, value in new_refs.iteritems(): + pool[key].extend(value) + class CompositeRimeSolver(MontblancNumpySolver): """ Composite solver implementation for RIME. @@ -265,8 +272,9 @@ def _gen_source_slices(self): # Get the source slice ranges for each individual # source type, and update the CPU dictionary with them - cpu_slice.update(mbu.source_range_slices( - src, src_end, src_nr_var_counts)) + src_range_slices = mbu.source_range_slices( + src, src_end, src_nr_var_counts) + cpu_slice.update(src_range_slices) # and configure the same for GPU slices for s in src_nr_vars: @@ -449,8 +457,8 @@ def _enqueue_array_slice(self, r, subslvr, cache_idx = dirty.get(r.name, None) if cache_idx and cache_idx == cpu_idx: - montblanc.log.debug("Cache hit on {n} index {i} " - .format(n=r.name, i=cpu_idx)) + #montblanc.log.debug("Cache hit on {n} index {i} " + # .format(n=r.name, i=cpu_idx)) return None @@ -514,7 +522,7 @@ def _enqueue_array(self, subslvr, Returns a list of arrays allocated with pinned memory. Entries in this list may be None. """ - pool_refs = [] + pool_refs = defaultdict(list) if direction is None: direction = ASYNC_HTOD @@ -569,7 +577,8 @@ def _enqueue_array(self, subslvr, cpu_ary, cpu_idx, gpu_ary, tuple(gpu_idx), direction, dirty) - pool_refs.append(pinned_ary) + if pinned_ary is not None: + pool_refs[r.name].append(pinned_ary) return pool_refs @@ -715,6 +724,11 @@ def _thread_create_solvers(self, subslvr_cfg, P, nsolvers): # Mutex for guarding the memory pools pool_lock = threading.Lock() + # Dirty index, indicating the CPU index of the + # data currently on the GPU, used for avoiding + # array transfer + self.thread_local.dirty = [{} for n in range(nsolvers)] + # Configure thread local storage # Number of solvers in this thread self.thread_local.nsolvers = nsolvers @@ -781,8 +795,8 @@ def _thread_prime_memory_pools(self): nsrc = self.dim_local_size('nsrc') # Retain references to pool allocations - pinned_pool_refs = [] - device_pool_refs = [] + pinned_pool_refs = defaultdict(list) + device_pool_refs = defaultdict(list) pinned_allocated = 0 # Class of arrays that are to be transferred @@ -794,15 +808,15 @@ def _thread_prime_memory_pools(self): # Estimate number of kernels for constant data nkernels = len(classifiers) + # Detect already transferred array chunks + dirty = {} + # Get the first chunk of the visibility space cpu_slice_map, gpu_slice_map = self._gen_vis_slices().next() for i, subslvr in enumerate(self.thread_local.solvers): # For the maximum number of visibility chunks that can be enqueued for T in range(self.throttle_factor): - # Detect already transferred array chunks - dirty = {} - # For each source batch within the visibility chunk for cpu_src_slice_map, gpu_src_slice_map in self._gen_source_slices(): cpu_slice_map.update(cpu_src_slice_map) @@ -814,22 +828,23 @@ def _thread_prime_memory_pools(self): cpu_slice_map, gpu_slice_map, direction=ASYNC_HTOD, dirty=dirty, classifiers=classifiers) - pinned_allocated += sum([r.nbytes for r in refs if r is not None]) - pinned_pool_refs.extend(refs) + pinned_allocated += sum([r.nbytes + for l in refs.values() for r in l]) + _update_refs(pinned_pool_refs, refs) # Allocate pinned memory for constant memory transfers cdata = subslvr.const_data().ndary() for k in range(nkernels): - ref = subslvr.pinned_mem_pool.allocate( + cdata_ref = subslvr.pinned_mem_pool.allocate( shape=cdata.shape, dtype=cdata.dtype) - pinned_allocated += ref.nbytes - pinned_pool_refs.append(ref) + pinned_allocated += cdata_ref.nbytes + pinned_pool_refs['cdata'].append(cdata_ref) # Allocate device memory for arrays that need to be # allocated from a pool by PyCUDA's reduction kernels dev_ary = subslvr.dev_mem_pool.allocate(self.X2.nbytes) - device_pool_refs.append(dev_ary) + device_pool_refs['X2_gpu'].append(dev_ary) device = self.thread_local.context.get_device() @@ -838,12 +853,23 @@ def _thread_prime_memory_pools(self): d=device.name(), n=mbu.fmt_bytes(pinned_allocated))) # Now force return of memory to the pools - for a in pinned_pool_refs: - if a is not None: - a.base.free() + for key, array_list in pinned_pool_refs.iteritems(): + [a.base.free() for a in array_list] + + for array_list in device_pool_refs.itervalues(): + [a.free() for a in array_list] + + + def _thread_start_solving(self): + """ + Contains enqueue thread functionality that needs to + take place at the start of each solution + """ + + # Clear the dirty dictionary to force each array to be + # transferred at least once. e.g. the beam cube + [d.clear() for d in self.thread_local.dirty] - for a in device_pool_refs: - a.free() def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): """ @@ -860,17 +886,17 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): tl = self.thread_local i, subslvr = tl.subslvr_gen.next() - # A list of references to memory pool allocated objects + # A dictionary of references to memory pool allocated objects # ensuring that said objects remained allocated until # after compute has been performed. Returned from # this function, this object should be discarded when # reading the result of the enqueued operations. - pool_refs = [] + pool_refs = defaultdict(list) # Cache keyed by array names and contained indices - # This is used to determine if the array has already - # been transferred to the card on this batch - dirty = {} + # This is used to avoid unnecessary CPU to GPU copies + # by caching the last index of the CPU array + dirty = tl.dirty[i] # Guard pool allocations with a coarse-grained mutex with subslvr.pool_lock: @@ -890,42 +916,50 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Enqueue E Beam kernel = subslvr.rime_e_beam - pool_refs.extend(self._enqueue_array( + new_refs = self._enqueue_array( subslvr, cpu_slice_map, gpu_slice_map, direction=ASYNC_HTOD, dirty=dirty, - classifiers=[Classifier.E_BEAM_INPUT])) - pool_refs.append(self._enqueue_const_data_htod( - subslvr, kernel.rime_const_data[0])) + classifiers=[Classifier.E_BEAM_INPUT]) + cdata_ref = self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0]) + _update_refs(pool_refs, new_refs) + _update_refs(pool_refs, {'cdata_ebeam' : [cdata_ref]}) kernel.execute(subslvr, subslvr.stream) # Enqueue B Sqrt kernel = subslvr.rime_b_sqrt - pool_refs.extend(self._enqueue_array( + new_refs = self._enqueue_array( subslvr, cpu_slice_map, gpu_slice_map, direction=ASYNC_HTOD, dirty=dirty, - classifiers=[Classifier.B_SQRT_INPUT])) - pool_refs.append(self._enqueue_const_data_htod( - subslvr, kernel.rime_const_data[0])) + classifiers=[Classifier.B_SQRT_INPUT]) + cdata_ref = self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0]) + _update_refs(pool_refs, new_refs) + _update_refs(pool_refs, {'cdata_bsqrt' : [cdata_ref]}) kernel.execute(subslvr, subslvr.stream) # Enqueue EKB Sqrt kernel = subslvr.rime_ekb_sqrt - pool_refs.extend(self._enqueue_array( + new_refs = self._enqueue_array( subslvr, cpu_slice_map, gpu_slice_map, direction=ASYNC_HTOD, dirty=dirty, - classifiers=[Classifier.EKB_SQRT_INPUT])) - pool_refs.append(self._enqueue_const_data_htod( - subslvr, kernel.rime_const_data[0])) + classifiers=[Classifier.EKB_SQRT_INPUT]) + cdata_ref = self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0]) + _update_refs(pool_refs, new_refs) + _update_refs(pool_refs, {'cdata_ekb' : [cdata_ref]}) kernel.execute(subslvr, subslvr.stream) # Enqueue Sum Coherencies kernel = subslvr.rime_sum - pool_refs.extend(self._enqueue_array( + new_refs = self._enqueue_array( subslvr, cpu_slice_map, gpu_slice_map, direction=ASYNC_HTOD, dirty=dirty, - classifiers=[Classifier.COHERENCIES_INPUT])) - pool_refs.append(self._enqueue_const_data_htod( - subslvr, kernel.rime_const_data[0])) + classifiers=[Classifier.COHERENCIES_INPUT]) + cdata_ref = self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0]) + _update_refs(pool_refs, new_refs) + _update_refs(pool_refs, {'cdata_coherencies' : [cdata_ref]}) kernel.execute(subslvr, subslvr.stream) # Iterate again for gradient computation requiring visibilities computed @@ -945,12 +979,14 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Enqueue Sersic gradient kernel = subslvr.rime_sersic_gradient - pool_refs.extend(self._enqueue_array( + new_refs = self._enqueue_array( subslvr, cpu_slice_map, gpu_slice_map, direction=ASYNC_HTOD, dirty=dirty, - classifiers=[Classifier.COHERENCIES_INPUT])) - pool_refs.append(self._enqueue_const_data_htod( - subslvr, kernel.rime_const_data[0])) + classifiers=[Classifier.COHERENCIES_INPUT]) + cdata_ref = self._enqueue_const_data_htod( + subslvr, kernel.rime_const_data[0]) + _update_refs(pool_refs, new_refs) + _update_refs(pool_refs, {'cdata_gradient' : [cdata_ref]}) kernel.execute(subslvr, subslvr.stream) # Enqueue chi-squared term reduction and return the @@ -965,20 +1001,21 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream) # Enqueue transfer of simulator output (model visibilities) to the CPU - refs = self._enqueue_array(subslvr, cpu_slice_map, gpu_slice_map, - direction=ASYNC_DTOH, dirty=dirty, + sim_output_refs = self._enqueue_array(subslvr, + cpu_slice_map, gpu_slice_map, + direction=ASYNC_DTOH, dirty={}, classifiers=[Classifier.SIMULATOR_OUTPUT]) # Should only be model visibilities if self.enable_sersic_grad: - assert len(refs) == 2, ('Expected two arrays (model visibilities, X2_grad), ' + assert len(sim_output_refs) == 2, ('Expected two arrays (model visibilities, X2_grad), ' 'received {l} instead.'.format(l=len(refs))) - X2_grad = refs[1] + X2_grad = sim_output_refs['X2_grad'][0] else: - assert len(refs) == 1, ('Expected one array (model visibilities), ' + assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), ' 'received {l} instead.'.format(l=len(refs))) - model_vis = refs[0] + model_vis = sim_output_refs['model_vis'][0] # Create and record an event directly after the chi-squared copy # We'll synchronise on this thread in our synchronisation executor @@ -988,11 +1025,11 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Retain references to CPU pinned and GPU device memory # until the above enqueued operations have been performed. - pool_refs.append(X2_gpu_ary) - pool_refs.append(sub_X2) - pool_refs.append(model_vis) + pool_refs['X2_gpu'].append(X2_gpu_ary) + pool_refs['X2_cpu'].append(sub_X2) + pool_refs['model_vis_output'].append(model_vis) if self.enable_sersic_grad: - pool_refs.append(X2_grad) + pool_refs['X2_grad'].append(X2_grad) return (sync_event, sub_X2, model_vis, X2_grad, pool_refs, subslvr.pool_lock, cpu_slice_map.copy(), @@ -1028,19 +1065,26 @@ def _free_pool_allocs(pool_refs, pool_lock): import pycuda.driver as cuda import pycuda.gpuarray as gpuarray + cuda_types = (cuda.PooledDeviceAllocation, cuda.PooledHostAllocation) + + debug_str_list = ['Pool de-allocations per array name',] + debug_str_list.extend('({k}, {l})'.format(k=k,l=len(v)) + for k, v in pool_refs.iteritems()) + + montblanc.log.debug(' '.join(debug_str_list)) + with pool_lock: - for ref in pool_refs: - if ref is None: - continue - elif isinstance(ref, np.ndarray): + for k, ref in ((k, r) for k, rl in pool_refs.iteritems() + for r in rl): + if isinstance(ref, np.ndarray): ref.base.free() - elif isinstance(ref, (cuda.PooledDeviceAllocation, cuda.PooledHostAllocation)): + elif isinstance(ref, cuda_types): ref.free() elif isinstance(ref, gpuarray.GPUArray): ref.gpudata.free() else: - raise TypeError("Unable to release pool allocated " - "object of type {t}.".format(t=type(ref))) + raise TypeError("Don't know how to release pool allocated " + "object '{n}'' of type {t}.".format(n=k, t=type(ref))) def _sync_wait(future): """ @@ -1058,15 +1102,14 @@ def _sync_wait(future): cuda_event.synchronize() except cuda.LogicError as e: # Format the slices nicely - for k, s in cpu.iteritems(): - cpu[k] = '[{b}, {e}]'.format(b=s.start, e=s.stop) - - for k, s in gpu.iteritems(): - gpu[k] = '[{b}, {e}]'.format(b=s.start, e=s.stop) + pretty_cpu = { k: '[{b}, {e}]'.format(b=s.start, e=s.stop) + for k, s in cpu.iteritems() } + pretty_gpu = { k: '[{b}, {e}]'.format(b=s.start, e=s.stop) + for k, s in gpu.iteritems() } import json - print 'GPU', json.dumps(gpu, indent=2) - print 'CPU', json.dumps(cpu, indent=2) + print 'GPU', json.dumps(pretty_gpu, indent=2) + print 'CPU', json.dumps(pretty_cpu, indent=2) raise e, None, sys.exc_info()[2] # Work out the CPU view in the model visibilities @@ -1093,6 +1136,12 @@ def _sync_wait(future): # For easier typing C = CompositeRimeSolver + # Perform any initialisation required by each + # thread prior to solving + for f in cf.as_completed([ex.submit(self._thread_start_solving) + for ex in self.enqueue_executors]): + f.result() + # Create an iterator that cycles through each device's # executors, also returning the device index (enumerate) ex_it = itertools.cycle(enumerate( @@ -1223,6 +1272,8 @@ def shutdown(self): def _shutdown_func(): for i, subslvr in enumerate(self.thread_local.solvers): subslvr.shutdown() + subslvr.dev_mem_pool.stop_holding() + subslvr.pinned_mem_pool.stop_holding() self._thread_shutdown() @@ -1236,7 +1287,7 @@ def _shutdown_func(): def _thread_property_setter(self, name, value): for subslvr in self.thread_local.solvers: - setter_method_name = self.setter_name(name) + setter_method_name = hcu.setter_name(name) setter_method = getattr(subslvr, setter_method_name) setter_method(value) From 3d5d068bdada467475df3026822d084af641cf68 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Thu, 18 Aug 2016 18:29:13 +0100 Subject: [PATCH 21/33] minor change --- .../impl/rime/v4/gpu/SersicChiSquaredGradient.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 2ed9cb437..5581ab3d3 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -374,7 +374,7 @@ def __init__(self, weight_vector=False): def initialise(self, solver, stream=None): slvr = solver - nssrc, ntime, nbl, npolchan = slvr.dim_local_size('nssrc','ntime', 'nbl', 'npolchan') + ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan') # Get a property dictionary off the solver D = slvr.template_dict() @@ -406,11 +406,6 @@ def initialise(self, solver, stream=None): include_dirs=[montblanc.get_source_path()], no_extern_c=True) - if slvr.is_float(): - self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float32) - else: - self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64) - self.rime_const_data = self.mod.get_global('C') self.kernel = self.mod.get_function(kname) self.launch_params = self.get_launch_params(slvr, D) @@ -452,7 +447,11 @@ def execute(self, solver, stream=None): sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \ else slvr.sersic_shape - self.gradient.fill(0.) + nssrc = slvr.dim_local_size('nssrc') + if slvr.is_float(): + self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float32) + else: + self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64) self.kernel(slvr.uvw, sersic, slvr.frequency, slvr.antenna1, slvr.antenna2, From 7bd82e95ab28a3890c838a2d00d0e5dc1b56c283 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Thu, 18 Aug 2016 18:30:06 +0100 Subject: [PATCH 22/33] bug fixed: X2_grad initialisation --- montblanc/impl/rime/v5/CompositeRimeSolver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index 84abfcd80..ec2b0a092 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -965,6 +965,7 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Iterate again for gradient computation requiring visibilities computed # for all source chuncks if self.enable_sersic_grad: + subslvr.X2_grad.fill(0, stream=subslvr.stream) for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices(): # Update our maps with source slice information cpu_slice_map.update(src_cpu_slice_map) @@ -1190,8 +1191,8 @@ def _accumulate(model_slice, model_chunk): if nvalues > self.throttle_factor: continue - if self.enable_sersic_grad: - self.X2_grad = self.ft(np.zeros((3,nssrc))) + #if self.enable_sersic_grad: + # self.X2_grad = self.ft(np.zeros((3,nssrc))) # Enqueue CUDA operations for solving # this visibility chunk. From 8e0866ee29c69a9df3e4777f296436951ee141cb Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Thu, 18 Aug 2016 19:24:41 +0100 Subject: [PATCH 23/33] minor change --- montblanc/impl/rime/v5/CompositeRimeSolver.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index ec2b0a092..ab6e5036b 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -1010,11 +1010,11 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Should only be model visibilities if self.enable_sersic_grad: assert len(sim_output_refs) == 2, ('Expected two arrays (model visibilities, X2_grad), ' - 'received {l} instead.'.format(l=len(refs))) + 'received {l} instead.'.format(l=len(new_refs))) X2_grad = sim_output_refs['X2_grad'][0] else: assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), ' - 'received {l} instead.'.format(l=len(refs))) + 'received {l} instead.'.format(l=len(new_refs))) model_vis = sim_output_refs['model_vis'][0] @@ -1191,9 +1191,6 @@ def _accumulate(model_slice, model_chunk): if nvalues > self.throttle_factor: continue - #if self.enable_sersic_grad: - # self.X2_grad = self.ft(np.zeros((3,nssrc))) - # Enqueue CUDA operations for solving # this visibility chunk. enqueue_future = enq_ex.submit( @@ -1266,7 +1263,8 @@ def _accumulate(model_slice, model_chunk): self.set_X2(X2_sum) else: self.set_X2(X2_sum/self.sigma_sqrd) - self.X2_grad = X2_grad/self.sigma_sqrd + if self.enable_sersic_grad: + self.X2_grad = X2_grad/self.sigma_sqrd def shutdown(self): """ Shutdown the solver """ From 5abd1e2e87db02993aba5b9847da544ea0265183 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Fri, 19 Aug 2016 11:50:26 +0100 Subject: [PATCH 24/33] change X2_grad classifier into GPU_SCRATCH to manage the case where gradient is not enabled --- montblanc/impl/rime/v4/config.py | 2 +- montblanc/impl/rime/v5/CompositeRimeSolver.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py index edf14a349..158a93fe0 100644 --- a/montblanc/impl/rime/v4/config.py +++ b/montblanc/impl/rime/v4/config.py @@ -236,5 +236,5 @@ class Classifier(Enum): ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft', classifiers=frozenset([Classifier.GPU_SCRATCH])), ary_dict('X2_grad', (3,'nssrc',), 'ft', - classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])), + classifiers=frozenset([Classifier.GPU_SCRATCH])), ] diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index ab6e5036b..22bb34804 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -1001,6 +1001,12 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Enqueue chi-squared copy off the GPU onto the CPU X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream) + # Get pinned memory and copy from the GPU the partial X2_grad + if self.enable_sersic_grad: + X2_grad = subslvr.pinned_mem_pool.allocate( + shape=subslvr.X2_grad.shape, dtype=subslvr.X2_grad.dtype) + subslvr.X2_grad.get_async(ary=X2_grad, stream=subslvr.stream) + # Enqueue transfer of simulator output (model visibilities) to the CPU sim_output_refs = self._enqueue_array(subslvr, cpu_slice_map, gpu_slice_map, @@ -1008,13 +1014,8 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): classifiers=[Classifier.SIMULATOR_OUTPUT]) # Should only be model visibilities - if self.enable_sersic_grad: - assert len(sim_output_refs) == 2, ('Expected two arrays (model visibilities, X2_grad), ' - 'received {l} instead.'.format(l=len(new_refs))) - X2_grad = sim_output_refs['X2_grad'][0] - else: - assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), ' - 'received {l} instead.'.format(l=len(new_refs))) + assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), ' + 'received {l} instead.'.format(l=len(new_refs))) model_vis = sim_output_refs['model_vis'][0] From a65b50f53a1e676101150d5e04405a937a759a61 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Fri, 19 Aug 2016 16:52:15 +0100 Subject: [PATCH 25/33] Fixed X2_grad array management in case of more src chunks --- .../rime/v4/gpu/SersicChiSquaredGradient.py | 2 +- montblanc/impl/rime/v5/CompositeRimeSolver.py | 26 ++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 5581ab3d3..89ed7b810 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -325,7 +325,7 @@ class SumCohTraits { // 3 different threads writes each a different component to avoid serialisation if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) { - i = SRC - SRC_START + threadIdx.x*NSSRC; + i = SRC - SRC_START + threadIdx.x*DEXT(nssrc); atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]); } } diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index 22bb34804..8f2586b40 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -181,6 +181,7 @@ def __init__(self, slvr_cfg): key=lambda T: T[1]) P, M, mem = budgets[0] + #P[Options.NSRC] = slvr_cfg.get(Options.SOURCE_BATCH_SIZE) # Log some information about the memory budget # and dimension reduction @@ -965,7 +966,12 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Iterate again for gradient computation requiring visibilities computed # for all source chuncks if self.enable_sersic_grad: - subslvr.X2_grad.fill(0, stream=subslvr.stream) + + # Get pinned memory to hold the full X2 gradient + nssrc = self.dim_local_size('nssrc') + sub_X2_grad = subslvr.pinned_mem_pool.allocate( + shape=(3,nssrc), dtype=subslvr.X2_grad.dtype) + for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices(): # Update our maps with source slice information cpu_slice_map.update(src_cpu_slice_map) @@ -977,7 +983,9 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): global_size=self.dimension(name).global_size, lower_extent=slice_.start, upper_extent=slice_.stop) - + + subslvr.X2_grad.fill(0, stream=subslvr.stream) + # Enqueue Sersic gradient kernel = subslvr.rime_sersic_gradient new_refs = self._enqueue_array( @@ -990,6 +998,10 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): _update_refs(pool_refs, {'cdata_gradient' : [cdata_ref]}) kernel.execute(subslvr, subslvr.stream) + # Enqueue the X2 gradient slice opy off the GPU onto the CPU + src_range = src_cpu_slice_map['nssrc'] + subslvr.X2_grad.get_async(ary=sub_X2_grad[:,src_range], stream=subslvr.stream) + # Enqueue chi-squared term reduction and return the # GPU array allocated to it X2_gpu_ary = subslvr.rime_reduce.execute(subslvr, subslvr.stream) @@ -1001,12 +1013,6 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): # Enqueue chi-squared copy off the GPU onto the CPU X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream) - # Get pinned memory and copy from the GPU the partial X2_grad - if self.enable_sersic_grad: - X2_grad = subslvr.pinned_mem_pool.allocate( - shape=subslvr.X2_grad.shape, dtype=subslvr.X2_grad.dtype) - subslvr.X2_grad.get_async(ary=X2_grad, stream=subslvr.stream) - # Enqueue transfer of simulator output (model visibilities) to the CPU sim_output_refs = self._enqueue_array(subslvr, cpu_slice_map, gpu_slice_map, @@ -1031,8 +1037,8 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): pool_refs['X2_cpu'].append(sub_X2) pool_refs['model_vis_output'].append(model_vis) if self.enable_sersic_grad: - pool_refs['X2_grad'].append(X2_grad) - return (sync_event, sub_X2, model_vis, X2_grad, + pool_refs['X2_grad'].append(sub_X2_grad) + return (sync_event, sub_X2, model_vis, sub_X2_grad, pool_refs, subslvr.pool_lock, cpu_slice_map.copy(), gpu_slice_map.copy()) From 7457d38869f01436afc5b23bbe674bdb8d98fd9c Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Thu, 25 Aug 2016 10:58:51 +0200 Subject: [PATCH 26/33] Pass results via dict, instead of tuple Provides more flexibility for different result types --- montblanc/impl/rime/v5/CompositeRimeSolver.py | 84 +++++++++++-------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index 5bbb6abd7..1ef142239 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -1049,17 +1049,22 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): pool_refs['X2_gpu'].append(X2_gpu_ary) pool_refs['X2_cpu'].append(sub_X2) pool_refs['model_vis_output'].append(model_vis) + + data = { 'sync_event' : sync_event, + 'sub_X2' : sub_X2, + 'model_vis' : model_vis, + 'pool_refs' : pool_refs, + 'pool_lock' : subslvr.pool_lock, + 'cpu' : cpu_slice_map.copy(), + 'gpu' : gpu_slice_map.copy(), + } + + # Add X2 gradient data if self.enable_sersic_grad: - pool_refs['X2_grad'].append(sub_X2_grad) - return (sync_event, sub_X2, model_vis, sub_X2_grad, - pool_refs, subslvr.pool_lock, - cpu_slice_map.copy(), - gpu_slice_map.copy()) - else: - return (sync_event, sub_X2, model_vis, - pool_refs, subslvr.pool_lock, - cpu_slice_map.copy(), - gpu_slice_map.copy()) + data['sub_X2_grad'] = sub_X2_grad + data['pool_refs']['X2_grad'].append(sub_X2_grad) + + return data def initialise(self): """ Initialise the sub-solver """ @@ -1112,12 +1117,10 @@ def _sync_wait(future): Return a copy of the pinned chi-squared, pinned model visibilities and index into the CPU array after synchronizing on the cuda_event """ - if self.enable_sersic_grad: - cuda_event, pinned_X2, pinned_model_vis, pinned_X2_grad,\ - pool_refs, pool_lock, cpu, gpu = future.result() - else: - cuda_event, pinned_X2, pinned_model_vis, \ - pool_refs, pool_lock, cpu, gpu = future.result() + data = future.result() + + cuda_event, pool_refs, pool_lock, cpu, gpu = (data[k] for k in + ('sync_event', 'pool_refs', 'pool_lock', 'cpu', 'gpu')) try: cuda_event.synchronize() @@ -1142,17 +1145,18 @@ def _sync_wait(future): model_vis_shape = tuple(cpu[s].stop - cpu[s].start if s in cpu else s for s in model_vis_sshape) - X2 = pinned_X2.copy() - model_vis = pinned_model_vis.copy().reshape(model_vis_shape) + result = { + 'X2' : data['sub_X2'].copy(), + 'model_vis' : data['model_vis'].copy().reshape(model_vis_shape), + 'model_vis_idx' : model_vis_idx, + } + if self.enable_sersic_grad: - X2_grad = pinned_X2_grad.copy() + result['X2_grad'] = data['sub_X2_grad'].copy() _free_pool_allocs(pool_refs, pool_lock) - if self.enable_sersic_grad: - return X2, model_vis, model_vis_idx, X2_grad - else: - return X2, model_vis, model_vis_idx + return result # For easier typing C = CompositeRimeSolver @@ -1244,15 +1248,18 @@ def _accumulate(model_slice, model_chunk): s.discard(f) # Get chi-squared and model visibilities (and X2 gradient) - if self.enable_sersic_grad: - X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() - X2_grad += pinned_X2_grad - else: - X2, pinned_model_vis, model_vis_idx = f.result() + result = f.result() # Sum X2 - X2_sum += X2 + X2_sum += result['X2'] + + if self.enable_sersic_grad: + X2_sum += result['X2_grad'] + + model_vis_idx = result['model_vis_idx'] + pinned_model_vis = result['model_vis'] # Write model visibilities to the numpy array - vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:]) + vis_write(self.model_vis[model_vis_idx], + pinned_model_vis[:]) # Break out if we've removed the prescribed # number of futures @@ -1267,15 +1274,18 @@ def _accumulate(model_slice, model_chunk): # Sum remaining chi-squared for f in done: # Get chi-squared and model visibilities (and X2 gradient) - if self.enable_sersic_grad: - X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() - X2_grad += pinned_X2_grad - else: - X2, pinned_model_vis, model_vis_idx = f.result() + result = f.result() # Sum X2 - X2_sum += X2 + X2_sum += result['X2'] + + if self.enable_sersic_grad: + X2_sum += result['X2_grad'] + + model_vis_idx = result['model_vis_idx'] + pinned_model_vis = result['model_vis'] # Write model visibilities to the numpy array - vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:]) + vis_write(self.model_vis[model_vis_idx], + pinned_model_vis[:]) # Set the chi-squared value possibly # taking the weight vector into account From 15e99b84092f07e79596ef0354935b7bc4f9315c Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Thu, 8 Sep 2016 17:01:17 +0100 Subject: [PATCH 27/33] Revert "Pass results via dict, instead of tuple" This reverts commit 7457d38869f01436afc5b23bbe674bdb8d98fd9c. --- montblanc/impl/rime/v5/CompositeRimeSolver.py | 84 ++++++++----------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index 1ef142239..5bbb6abd7 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -1049,22 +1049,17 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs): pool_refs['X2_gpu'].append(X2_gpu_ary) pool_refs['X2_cpu'].append(sub_X2) pool_refs['model_vis_output'].append(model_vis) - - data = { 'sync_event' : sync_event, - 'sub_X2' : sub_X2, - 'model_vis' : model_vis, - 'pool_refs' : pool_refs, - 'pool_lock' : subslvr.pool_lock, - 'cpu' : cpu_slice_map.copy(), - 'gpu' : gpu_slice_map.copy(), - } - - # Add X2 gradient data if self.enable_sersic_grad: - data['sub_X2_grad'] = sub_X2_grad - data['pool_refs']['X2_grad'].append(sub_X2_grad) - - return data + pool_refs['X2_grad'].append(sub_X2_grad) + return (sync_event, sub_X2, model_vis, sub_X2_grad, + pool_refs, subslvr.pool_lock, + cpu_slice_map.copy(), + gpu_slice_map.copy()) + else: + return (sync_event, sub_X2, model_vis, + pool_refs, subslvr.pool_lock, + cpu_slice_map.copy(), + gpu_slice_map.copy()) def initialise(self): """ Initialise the sub-solver """ @@ -1117,10 +1112,12 @@ def _sync_wait(future): Return a copy of the pinned chi-squared, pinned model visibilities and index into the CPU array after synchronizing on the cuda_event """ - data = future.result() - - cuda_event, pool_refs, pool_lock, cpu, gpu = (data[k] for k in - ('sync_event', 'pool_refs', 'pool_lock', 'cpu', 'gpu')) + if self.enable_sersic_grad: + cuda_event, pinned_X2, pinned_model_vis, pinned_X2_grad,\ + pool_refs, pool_lock, cpu, gpu = future.result() + else: + cuda_event, pinned_X2, pinned_model_vis, \ + pool_refs, pool_lock, cpu, gpu = future.result() try: cuda_event.synchronize() @@ -1145,18 +1142,17 @@ def _sync_wait(future): model_vis_shape = tuple(cpu[s].stop - cpu[s].start if s in cpu else s for s in model_vis_sshape) - result = { - 'X2' : data['sub_X2'].copy(), - 'model_vis' : data['model_vis'].copy().reshape(model_vis_shape), - 'model_vis_idx' : model_vis_idx, - } - + X2 = pinned_X2.copy() + model_vis = pinned_model_vis.copy().reshape(model_vis_shape) if self.enable_sersic_grad: - result['X2_grad'] = data['sub_X2_grad'].copy() + X2_grad = pinned_X2_grad.copy() _free_pool_allocs(pool_refs, pool_lock) - return result + if self.enable_sersic_grad: + return X2, model_vis, model_vis_idx, X2_grad + else: + return X2, model_vis, model_vis_idx # For easier typing C = CompositeRimeSolver @@ -1248,18 +1244,15 @@ def _accumulate(model_slice, model_chunk): s.discard(f) # Get chi-squared and model visibilities (and X2 gradient) - result = f.result() - # Sum X2 - X2_sum += result['X2'] - if self.enable_sersic_grad: - X2_sum += result['X2_grad'] - - model_vis_idx = result['model_vis_idx'] - pinned_model_vis = result['model_vis'] + X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() + X2_grad += pinned_X2_grad + else: + X2, pinned_model_vis, model_vis_idx = f.result() + # Sum X2 + X2_sum += X2 # Write model visibilities to the numpy array - vis_write(self.model_vis[model_vis_idx], - pinned_model_vis[:]) + vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:]) # Break out if we've removed the prescribed # number of futures @@ -1274,18 +1267,15 @@ def _accumulate(model_slice, model_chunk): # Sum remaining chi-squared for f in done: # Get chi-squared and model visibilities (and X2 gradient) - result = f.result() - # Sum X2 - X2_sum += result['X2'] - if self.enable_sersic_grad: - X2_sum += result['X2_grad'] - - model_vis_idx = result['model_vis_idx'] - pinned_model_vis = result['model_vis'] + X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() + X2_grad += pinned_X2_grad + else: + X2, pinned_model_vis, model_vis_idx = f.result() + # Sum X2 + X2_sum += X2 # Write model visibilities to the numpy array - vis_write(self.model_vis[model_vis_idx], - pinned_model_vis[:]) + vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:]) # Set the chi-squared value possibly # taking the weight vector into account From 4879f6b2ec0dc00a908bb5dbe459185532b5813e Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Thu, 8 Sep 2016 17:52:02 +0100 Subject: [PATCH 28/33] replaced atomic addition in the shared memory with the block_reduce function --- .../rime/v4/gpu/SersicChiSquaredGradient.py | 53 ++++++++----------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 89ed7b810..02e7fe463 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -34,15 +34,15 @@ FLOAT_PARAMS = { 'BLOCKDIMX': 4, # Number of channels*4 polarisations - 'BLOCKDIMY': 25, # Number of baselines - 'BLOCKDIMZ': 10, # Number of timesteps + 'BLOCKDIMY': 8, # Number of baselines + 'BLOCKDIMZ': 32, # Number of timesteps 'maxregs': 48 # Maximum number of registers } DOUBLE_PARAMS = { 'BLOCKDIMX': 4, # Number of channels*4 polarisations - 'BLOCKDIMY': 25, # Number of baselines - 'BLOCKDIMZ': 10, # Number of timesteps + 'BLOCKDIMY': 8, # Number of baselines + 'BLOCKDIMZ': 32, # Number of timesteps 'maxregs': 63 # Maximum number of registers } @@ -50,6 +50,7 @@ #include #include #include \"math_constants.h\" +//#include #include #include @@ -143,6 +144,12 @@ class SumCohTraits { if(TIME >= DEXT(ntime) || BL >= DEXT(nbl) || POLCHAN >= DEXT(npolchan)) return; + // --- Specialize BlockReduce for type float. + typedef cub::BlockReduce BlockReduceT; + + // --- Allocate temporary storage in shared memory + __shared__ typename BlockReduceT::TempStorage temp_storage; + __shared__ struct { typename SumCohTraits::UVWType uvw[BLOCKDIMZ][BLOCKDIMY]; @@ -223,10 +230,7 @@ class SumCohTraits { i += DEXT(nssrc); shared.e2 = sersic_shape[i]; i += DEXT(nssrc); shared.sersic_scale = sersic_shape[i]; } - if(threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) - shared.X2_grad_part[threadIdx.x] = 0.; - - __syncthreads(); + __syncthreads(); // Create references to a // complicated part of shared memory @@ -296,31 +300,20 @@ class SumCohTraits { { dev_vis[p].x *= delta.x; dev_vis[p].y *= delta.y; + + dev_vis[p].x += dev_vis[p].y; + dev_vis[p].x *= 6; - typename Tr::ct other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 2); - // Add polarisations 2 and 3 to 0 and 1 - if((POLCHAN & 0x3) < 2) - { - dev_vis[p].x += other.x; - dev_vis[p].y += other.y; - } - other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 1); - - // If this is the polarisation 0, add polarisation 1 - // and write out this chi squared grad term - if((POLCHAN & 0x3) == 0) - { - dev_vis[p].x += other.x; - dev_vis[p].y += other.y; - - //atomic addition to avoid concurrent access in the shared memory - dev_vis[p].x += dev_vis[p].y; - dev_vis[p].x *= 6; - atomicAdd(&(shared.X2_grad_part[p]), dev_vis[p].x); - } + __syncthreads(); + + // block reduction, result available only in the first thread + T aggregate = BlockReduceT(temp_storage).Sum(dev_vis[p].x); + + if(threadIdx.z == 0 and threadIdx.y == 0 && threadIdx.x == 0) + shared.X2_grad_part[p] = aggregate; } - __syncthreads(); + __syncthreads(); //atomic addition to avoid concurrent access in the device memory (contribution for a single particle) // 3 different threads writes each a different component to avoid serialisation if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) From 6e30f12d346951ceed462622acdca17023381847 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Mon, 19 Sep 2016 16:33:09 +0100 Subject: [PATCH 29/33] fixed block_reduce bug in case of block dimensions not factors of nbaselines and ntimes --- .../rime/v4/gpu/SersicChiSquaredGradient.py | 150 +++++++++--------- 1 file changed, 79 insertions(+), 71 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 02e7fe463..1a8a2ab7d 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -34,7 +34,7 @@ FLOAT_PARAMS = { 'BLOCKDIMX': 4, # Number of channels*4 polarisations - 'BLOCKDIMY': 8, # Number of baselines + 'BLOCKDIMY': 8, # Number of baselines 'BLOCKDIMZ': 32, # Number of timesteps 'maxregs': 48 # Maximum number of registers } @@ -86,7 +86,7 @@ class SumCohTraits { #define LEXT(name) C.name.lower_extent #define UEXT(name) C.name.upper_extent #define DEXT(name) (C.name.upper_extent - C.name.lower_extent) - + #define NA (C.na.local_size) #define NBL (C.nbl.local_size) #define NCHAN (C.nchan.local_size) @@ -140,10 +140,7 @@ class SumCohTraits { int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x; int BL = blockIdx.y*blockDim.y + threadIdx.y; int TIME = blockIdx.z*blockDim.z + threadIdx.z; - - if(TIME >= DEXT(ntime) || BL >= DEXT(nbl) || POLCHAN >= DEXT(npolchan)) - return; - + // --- Specialize BlockReduce for type float. typedef cub::BlockReduce BlockReduceT; @@ -163,22 +160,28 @@ class SumCohTraits { } shared; + int i, ANT1, ANT2; typename Tr::ct dev_vis[3]; + dev_vis[0].x = 0.; dev_vis[1].x = 0; dev_vis[2].x = 0; + dev_vis[0].y = 0.; dev_vis[1].y = 0; dev_vis[2].y = 0; + typename Tr::ct delta; + delta.x = 0.; delta.y = 0.; - T & U = shared.uvw[threadIdx.z][threadIdx.y].x; - T & V = shared.uvw[threadIdx.z][threadIdx.y].y; - T & W = shared.uvw[threadIdx.z][threadIdx.y].z; - - int i; - - // Figure out the antenna pairs - i = TIME*NBL + BL; - int ANT1 = antenna1[i]; - int ANT2 = antenna2[i]; - - // UVW coordinates vary by baseline and time, but not polarised channel - if(threadIdx.x == 0) + if (TIME < DEXT(ntime) && BL < DEXT(nbl) && POLCHAN < DEXT(npolchan)) { + + T & U = shared.uvw[threadIdx.z][threadIdx.y].x; + T & V = shared.uvw[threadIdx.z][threadIdx.y].y; + T & W = shared.uvw[threadIdx.z][threadIdx.y].z; + + // Figure out the antenna pairs + i = TIME*NBL + BL; + ANT1 = antenna1[i]; + ANT2 = antenna2[i]; + + // UVW coordinates vary by baseline and time, but not polarised channel + if(threadIdx.x == 0) + { // UVW, calculated from u_pq = u_p - u_q i = TIME*NA + ANT1; shared.uvw[threadIdx.z][threadIdx.y] = uvw[i]; @@ -188,49 +191,53 @@ class SumCohTraits { U -= ant2_uvw.x; V -= ant2_uvw.y; W -= ant2_uvw.z; - } + } - // Wavelength varies by channel, but not baseline and time - // TODO uses 4 times the actually required space, since - // we don't need to store a frequency per polarisation - if(threadIdx.y == 0 && threadIdx.z == 0) + // Wavelength varies by channel, but not baseline and time + // TODO uses 4 times the actually required space, since + // we don't need to store a frequency per polarisation + if(threadIdx.y == 0 && threadIdx.z == 0) { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; } - i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN; - typename Tr::ct delta = observed_vis[i]; - delta.x -= visibilities[i].x; - delta.y -= visibilities[i].y; + i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN; + delta = observed_vis[i]; + delta.x -= visibilities[i].x; + delta.y -= visibilities[i].y; - // Zero the polarisation if it is flagged - if(flag[i] > 0) - { + // Zero the polarisation if it is flagged + if(flag[i] > 0) + { delta.x = 0; delta.y = 0; - } + } - // Apply any necessary weighting factors - if(apply_weights) - { + // Apply any necessary weighting factors + if(apply_weights) + { T w = weight_vector[i]; delta.x *= w; delta.y *= w; + } } - int SRC_START = DEXT(npsrc) + DEXT(ngsrc); int SRC_STOP = SRC_START + DEXT(nssrc); // Loop over Sersic Sources for(int SRC = SRC_START; SRC < SRC_STOP; ++SRC) { - // sersic shape only varies by source. Shape parameters - // thus apply to the entire block and we can load them with - // only the first thread. - if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) - { - i = SRC - SRC_START; shared.e1 = sersic_shape[i]; - i += DEXT(nssrc); shared.e2 = sersic_shape[i]; - i += DEXT(nssrc); shared.sersic_scale = sersic_shape[i]; - } - __syncthreads(); + + // sersic shape only varies by source. Shape parameters + // thus apply to the entire block and we can load them with + // only the first thread. + if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) + { + i = SRC - SRC_START; shared.e1 = sersic_shape[i]; + i += DEXT(nssrc); shared.e2 = sersic_shape[i]; + i += DEXT(nssrc); shared.sersic_scale = sersic_shape[i]; + } + __syncthreads(); + + if (TIME < DEXT(ntime) && BL < DEXT(nbl) && POLCHAN < DEXT(npolchan)) + { // Create references to a // complicated part of shared memory @@ -279,7 +286,7 @@ class SumCohTraits { dev_vis[1].y = ant_one.y*dev_e2; dev_vis[2].x = ant_one.x*dev_scale; dev_vis[2].y = ant_one.y*dev_scale; - /* + for (int p = 0; p < 3; p++) { // Multiply the visibility derivative by antenna 1's g term @@ -293,35 +300,36 @@ class SumCohTraits { montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant1_g_term, ant2_g_term); dev_vis[p] = ant1_g_term; } - - */ - // Write partial derivative with respect to sersic parameters - for (int p=0; p<3; p++) - { - dev_vis[p].x *= delta.x; - dev_vis[p].y *= delta.y; + + } + // Write partial derivative with respect to sersic parameters + for (int p=0; p<3; p++) + { + dev_vis[p].x *= delta.x; + dev_vis[p].y *= delta.y; - dev_vis[p].x += dev_vis[p].y; - dev_vis[p].x *= 6; - - __syncthreads(); + dev_vis[p].x += dev_vis[p].y; + dev_vis[p].x *= 6; + + __syncthreads(); - // block reduction, result available only in the first thread - T aggregate = BlockReduceT(temp_storage).Sum(dev_vis[p].x); + // block reduction, result available only in the first thread + T aggregate = BlockReduceT(temp_storage).Sum(dev_vis[p].x); - if(threadIdx.z == 0 and threadIdx.y == 0 && threadIdx.x == 0) - shared.X2_grad_part[p] = aggregate; - } + if(threadIdx.z == 0 and threadIdx.y == 0 && threadIdx.x == 0) + shared.X2_grad_part[p] = aggregate; + } - __syncthreads(); - //atomic addition to avoid concurrent access in the device memory (contribution for a single particle) - // 3 different threads writes each a different component to avoid serialisation - if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) - { - i = SRC - SRC_START + threadIdx.x*DEXT(nssrc); - atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]); - } + __syncthreads(); + //atomic addition to avoid concurrent access in the device memory (contribution for a single particle) + // 3 different threads writes each a different component to avoid serialisation + if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) + { + i = SRC - SRC_START + threadIdx.x*DEXT(nssrc); + atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]); + } } + } extern "C" { From 2795b5ff24788c0a3c59da97b171065801875c9e Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Wed, 10 May 2017 17:45:47 +0100 Subject: [PATCH 30/33] Enabled single polarization for GPU version --- montblanc/factory.py | 3 +- montblanc/impl/common/loaders/loaders.py | 12 +- montblanc/impl/rime/v4/config.py | 29 +- montblanc/impl/rime/v4/gpu/RimeBSqrt.py | 86 +++++- montblanc/impl/rime/v4/gpu/RimeEBeam.py | 287 +++++++++++++++++- montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py | 13 +- .../impl/rime/v4/gpu/RimeSumCoherencies.py | 73 +++-- .../rime/v4/gpu/SersicChiSquaredGradient.py | 18 +- montblanc/impl/rime/v4/loaders/loaders.py | 2 +- montblanc/impl/rime/v5/CompositeRimeSolver.py | 11 +- montblanc/impl/rime/v5/loaders/loaders.py | 2 +- 11 files changed, 472 insertions(+), 64 deletions(-) diff --git a/montblanc/factory.py b/montblanc/factory.py index f9d3f4ad9..eae4c4b54 100644 --- a/montblanc/factory.py +++ b/montblanc/factory.py @@ -137,12 +137,13 @@ def create_rime_solver_from_ms(slvr_class_type, slvr_cfg): autocor = slvr_cfg.get(Options.AUTO_CORRELATIONS) with MeasurementSetLoader(msfile, auto_correlations=autocor) as loader: - ntime, nbl, na, nbands, nchan = loader.get_dims() + ntime, nbl, na, nbands, nchan, npol = loader.get_dims() slvr_cfg[Options.NTIME] = ntime slvr_cfg[Options.NA] = na slvr_cfg[Options.NBL] = nbl slvr_cfg[Options.NBANDS] = nbands slvr_cfg[Options.NCHAN] = nchan + slvr_cfg[Options.NPOL] = npol slvr = slvr_class_type(slvr_cfg) loader.load(slvr, slvr_cfg) return slvr diff --git a/montblanc/impl/common/loaders/loaders.py b/montblanc/impl/common/loaders/loaders.py index b6f31d9cd..3ac524989 100644 --- a/montblanc/impl/common/loaders/loaders.py +++ b/montblanc/impl/common/loaders/loaders.py @@ -31,6 +31,7 @@ ANTENNA_TABLE = 'ANTENNA' SPECTRAL_WINDOW = 'SPECTRAL_WINDOW' FIELD_TABLE = 'FIELD' +POL_TABLE = 'POLARIZATION' class MeasurementSetLoader(BaseLoader): LOG_PREFIX = 'LOADER:' @@ -43,6 +44,7 @@ def __init__(self, msfile, auto_correlations=False): self.antfile = '::'.join((self.msfile, ANTENNA_TABLE)) self.freqfile = '::'.join((self.msfile, SPECTRAL_WINDOW)) self.fieldfile = '::'.join((self.msfile, FIELD_TABLE)) + self.polfile = '::'.join((self.msfile, POL_TABLE)) montblanc.log.info("{lp} Opening Measurement Set {ms}.".format( lp=self.LOG_PREFIX, ms=self.msfile)) @@ -58,6 +60,7 @@ def __init__(self, msfile, auto_correlations=False): # (2) baseline (ANTENNA1, ANTENNA2) # (3) band (SPECTRAL_WINDOW_ID via DATA_DESC_ID) ordering_query = ' '.join(["SELECT FROM $ms", + #"WHERE ANTENNA1 != ANTENNA2", "WHERE FIELD_ID={fid}".format(fid=field_id), "" if auto_correlations else "AND ANTENNA1 != ANTENNA2", "ORDERBY TIME, ANTENNA1, ANTENNA2, " @@ -70,6 +73,7 @@ def __init__(self, msfile, auto_correlations=False): self.tables['ant'] = at = pt.table(self.antfile, ack=False, readonly=True) self.tables['freq'] = ft = pt.table(self.freqfile, ack=False, readonly=True) self.tables['field'] = fit = pt.table(self.fieldfile, ack=False, readonly=True) + self.tables['pol'] = polt = pt.table(self.polfile, ack=False, readonly=True) self.nrows = ordered_ms.nrows() self.na = at.nrows() @@ -80,6 +84,9 @@ def __init__(self, msfile, auto_correlations=False): bl_query = "SELECT FROM $ordered_ms ORDERBY UNIQUE ANTENNA1, ANTENNA2" self.nbl = pt.taql(bl_query).nrows() + # Number of polarizations (assuming to be the same for all spectral windows) + self.npol = polt.getcol('NUM_CORR')[0] + # Number of channels per band chan_per_band = ft.getcol('NUM_CHAN') @@ -105,6 +112,8 @@ def __init__(self, msfile, auto_correlations=False): aeq=autocor_eq, ebl=expected_nbl, astr=autocor_str, nbl=self.nbl)) self.log("Found {nb} band(s), containing {cpb} channels.".format( nb=self.nbands, nc=chan_per_band[0], cpb=chan_per_band)) + self.log("Found {npol} polarization(s) in the POLARIZATION table.".format( + npol=self.npol)) # Sanity check computed rows vs actual rows computed_rows = self.ntime*self.nbl*self.nbands @@ -116,13 +125,14 @@ def __init__(self, msfile, auto_correlations=False): nbl=self.nbl, nb=self.nbands, cr=computed_rows, msr=self.nrows)) + def get_dims(self): """ Returns a tuple with the number of timesteps, baselines, antenna, channel bands and channels """ # Determine the problem dimensions - return self.ntime, self.nbl, self.na, self.nbands, self.nchan + return self.ntime, self.nbl, self.na, self.nbands, self.nchan, self.npol def log(self, msg, *args, **kwargs): montblanc.log.info('{lp} {m}'.format(lp=self.LOG_PREFIX, m=msg), diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py index d88bfb2d3..bf801d3ed 100644 --- a/montblanc/impl/rime/v4/config.py +++ b/montblanc/impl/rime/v4/config.py @@ -127,6 +127,8 @@ class Classifier(Enum): EKB_SQRT_INPUT = 'ekb_sqrt_input' COHERENCIES_INPUT = 'coherencies_input' + + # List of arrays A = [ # UVW coordinates @@ -177,15 +179,17 @@ class Classifier(Enum): default=1, test=lambda slvr, ary: rary(ary)), - ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 4), 'ct', + ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 'npol'), 'ct', classifiers=frozenset([Classifier.E_BEAM_INPUT]), - default=np.array([1,0,0,1])[np.newaxis,np.newaxis,np.newaxis,:], + default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 + else np.array([1])[np.newaxis,np.newaxis,:]), test=lambda slvr, ary: rary(ary)), # Direction-Independent Effects - ary_dict('G_term', ('ntime', 'na', 'nchan', 4), 'ct', + ary_dict('G_term', ('ntime', 'na', 'nchan', 'npol'), 'ct', classifiers=frozenset([Classifier.COHERENCIES_INPUT]), - default=np.array([1,0,0,1])[np.newaxis,np.newaxis,np.newaxis,:], + default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 + else np.array([1])[np.newaxis,np.newaxis,:]), test=lambda slvr, ary: rary(ary)), # Source Definitions @@ -195,9 +199,10 @@ class Classifier(Enum): default=0, test=lambda slvr, ary: (rary(ary)-0.5) * 1e-1), - ary_dict('stokes', ('nsrc','ntime', 4), 'ft', + ary_dict('stokes', ('nsrc','ntime', 'npol'), 'ft', classifiers=frozenset([Classifier.B_SQRT_INPUT]), - default=np.array([1,0,0,0])[np.newaxis,np.newaxis,:], + default= lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[2] == 4 + else np.array([1])[np.newaxis,np.newaxis,:]), test=rand_stokes), ary_dict('alpha', ('nsrc','ntime'), 'ft', @@ -216,7 +221,7 @@ class Classifier(Enum): test=rand_sersic_shape), # Visibility flagging arrays - ary_dict('flag', ('ntime', 'nbl', 'nchan', 4), np.uint8, + ary_dict('flag', ('ntime', 'nbl', 'nchan', 'npol'), np.uint8, classifiers=frozenset([Classifier.X2_INPUT, Classifier.COHERENCIES_INPUT]), default=0, @@ -224,23 +229,23 @@ class Classifier(Enum): 0, 1, size=ary.shape)), # Bayesian Data - ary_dict('weight_vector', ('ntime','nbl','nchan',4), 'ft', + ary_dict('weight_vector', ('ntime','nbl','nchan','npol'), 'ft', classifiers=frozenset([Classifier.X2_INPUT, Classifier.COHERENCIES_INPUT]), default=1, test=lambda slvr, ary: rary(ary)), - ary_dict('observed_vis', ('ntime','nbl','nchan',4), 'ct', + ary_dict('observed_vis', ('ntime','nbl','nchan','npol'), 'ct', classifiers=frozenset([Classifier.X2_INPUT, Classifier.COHERENCIES_INPUT]), default=0, test=lambda slvr, ary: rary(ary)), # Result arrays - ary_dict('B_sqrt', ('nsrc', 'ntime', 'nchan', 4), 'ct', + ary_dict('B_sqrt', ('nsrc', 'ntime', 'nchan', 'npol'), 'ct', classifiers=frozenset([Classifier.GPU_SCRATCH])), - ary_dict('jones', ('nsrc','ntime','na','nchan',4), 'ct', + ary_dict('jones', ('nsrc','ntime','na','nchan','npol'), 'ct', classifiers=frozenset([Classifier.GPU_SCRATCH])), - ary_dict('model_vis', ('ntime','nbl','nchan',4), 'ct', + ary_dict('model_vis', ('ntime','nbl','nchan','npol'), 'ct', classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])), ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft', classifiers=frozenset([Classifier.GPU_SCRATCH])), diff --git a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py index 784a2152a..1e56313fe 100644 --- a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py +++ b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py @@ -127,6 +127,59 @@ B_sqrt[i] = B_square_root; } + + +// Case NPOL = 1 + +template < + typename T, + typename Tr=montblanc::kernel_traits, + typename Po=montblanc::kernel_policies > +__device__ +void rime_jones1_B_sqrt_impl( + T * stokes, + T * alpha, + T * frequency, + T * ref_frequency, + typename Tr::ct * B_sqrt) +{ + int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x; + int TIME = blockIdx.y*blockDim.y + threadIdx.y; + int SRC = blockIdx.z*blockDim.z + threadIdx.z; + + if(SRC >= DEXT(nsrc) || TIME >= DEXT(ntime) || POLCHAN >= DEXT(nchan)) + { return; } + + __shared__ T freq_ratio[BLOCKDIMX]; + + // TODO. Using 3 times more shared memory than we + // really require here, since there's only + // one frequency per channel. + if(threadIdx.y == 0 && threadIdx.z == 0) + { + freq_ratio[threadIdx.x] = frequency[POLCHAN] / + ref_frequency[POLCHAN]; + } + + __syncthreads(); + + // Calculate the power term + int i = SRC*NTIME + TIME; + typename Tr::ft power = Po::pow(freq_ratio[threadIdx.x], alpha[i]); + + // Read in the stokes parameter, + // multiplying it by the power term + typename Tr::ft pol = stokes[i]*power; + + // Write out the square root of the brightness + i = (SRC*NTIME + TIME)*NPOLCHAN + POLCHAN; + B_sqrt[i].x = Po::sqrt(pol); + B_sqrt[i].y = 0.; +} + + + + extern "C" { #define stamp_jones_B_sqrt_fn(ft,ct) \ @@ -142,9 +195,27 @@ frequency, ref_frequency, B_sqrt); \ } +#define stamp_jones1_B_sqrt_fn(ft,ct) \ +__global__ void \ +rime_jones1_B_sqrt_ ## ft( \ + ft * stokes, \ + ft * alpha, \ + ft * frequency, \ + ft * ref_frequency, \ + ct * B_sqrt) \ +{ \ + rime_jones1_B_sqrt_impl(stokes, alpha, \ + frequency, ref_frequency, B_sqrt); \ +} + + stamp_jones_B_sqrt_fn(float,float2); stamp_jones_B_sqrt_fn(double,double2); +stamp_jones1_B_sqrt_fn(float,float2); +stamp_jones1_B_sqrt_fn(double,double2); + + } // extern "C" { """) @@ -154,8 +225,8 @@ def __init__(self): def initialise(self, solver, stream=None): slvr = solver - nsrc, ntime, npolchan = slvr.dim_local_size( - 'nsrc', 'ntime', 'npolchan') + nsrc, ntime, npolchan, npol = slvr.dim_local_size( + 'nsrc', 'ntime', 'npolchan', 'npol') # Get a property dictionary off the solver D = slvr.template_dict() @@ -171,9 +242,14 @@ def initialise(self, solver, stream=None): regs = str(FLOAT_PARAMS['maxregs'] \ if slvr.is_float() else DOUBLE_PARAMS['maxregs']) - kname = 'rime_jones_B_sqrt_float' \ - if slvr.is_float() is True else \ - 'rime_jones_B_sqrt_double' + if npol == 1: + kname = 'rime_jones1_B_sqrt_float' \ + if slvr.is_float() is True else \ + 'rime_jones1_B_sqrt_double' + else: + kname = 'rime_jones_B_sqrt_float' \ + if slvr.is_float() is True else \ + 'rime_jones_B_sqrt_double' kernel_string = KERNEL_TEMPLATE.substitute(**D) diff --git a/montblanc/impl/rime/v4/gpu/RimeEBeam.py b/montblanc/impl/rime/v4/gpu/RimeEBeam.py index c174f2640..1d9215406 100644 --- a/montblanc/impl/rime/v4/gpu/RimeEBeam.py +++ b/montblanc/impl/rime/v4/gpu/RimeEBeam.py @@ -83,18 +83,19 @@ #define BEAM_MH LOCAL(beam_mh) #define BEAM_NUD LOCAL(beam_nud) +// Infer channel from x thread + __device__ __forceinline__ int thread_chan() + { return threadIdx.x >> 2; } + // Infer polarisation from x thread __device__ __forceinline__ int ebeam_pol() { return threadIdx.x & 0x3; } -// Infer channel from x thread -__device__ __forceinline__ int thread_chan() - { return threadIdx.x >> 2; } template < typename T, typename Tr=montblanc::kernel_traits, - typename Po=montblanc::kernel_policies > + typename Po=montblanc::kernel_policies > __device__ __forceinline__ void trilinear_interpolate( typename Tr::ct & pol_sum, @@ -105,8 +106,31 @@ const T & gchan, const T & weight) { - int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan))*NPOL - + ebeam_pol(); + int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan))*NPOL +ebeam_pol(); + + // Perhaps unnecessary as long as BLOCKDIMX is 32 + typename Tr::ct pol = cub::ThreadLoad(E_beam + i); + pol_sum.x += weight*pol.x; + pol_sum.y += weight*pol.y; + abs_sum += weight*Po::abs(pol); +} + + +template < + typename T, + typename Tr=montblanc::kernel_traits, + typename Po=montblanc::kernel_policies > +__device__ __forceinline__ +void trilinear_interpolate_J1( + typename Tr::ct & pol_sum, + typename Tr::ft & abs_sum, + typename Tr::ct * E_beam, + const T & gl, + const T & gm, + const T & gchan, + const T & weight) +{ + int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan)); // Perhaps unnecessary as long as BLOCKDIMX is 32 typename Tr::ct pol = cub::ThreadLoad(E_beam + i); @@ -115,6 +139,7 @@ abs_sum += weight*Po::abs(pol); } + template class EBeamTraits {}; template <> class EBeamTraits @@ -343,6 +368,216 @@ } } + + +template < + typename T, + typename Tr=montblanc::kernel_traits, + typename Po=montblanc::kernel_policies > +__device__ +void rime_jones1_E_beam_impl( + typename EBeamTraits::LMType * lm, + typename EBeamTraits::ParallacticAngleType * parallactic_angles, + typename EBeamTraits::PointErrorType * point_errors, + typename EBeamTraits::AntennaScaleType * antenna_scaling, + typename Tr::ft * frequency, + typename Tr::ct * E_beam, + typename Tr::ct * jones, + T beam_ll, T beam_lm, T beam_lfreq, + T beam_ul, T beam_um, T beam_ufreq) +{ + int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x; + int ANT = blockIdx.y*blockDim.y + threadIdx.y; + int TIME = blockIdx.z*blockDim.z + threadIdx.z; + + typedef typename EBeamTraits::LMType LMType; + typedef typename EBeamTraits::PointErrorType PointErrorType; + typedef typename EBeamTraits::AntennaScaleType AntennaScaleType; + + if(TIME >= DEXT(ntime) || ANT >= DEXT(na) || POLCHAN >= DEXT(npolchan)) + return; + + __shared__ struct { + T lscale; // l axis scaling factor + T mscale; // m axis scaling factor + T pa_sin[BLOCKDIMZ][BLOCKDIMY]; // sin of parallactic angle + T pa_cos[BLOCKDIMZ][BLOCKDIMY]; // cos of parallactic angle + T gchan0[BLOCKDIMX]; // channel grid position (snapped) + T gchan1[BLOCKDIMX]; // channel grid position (snapped) + T chd[BLOCKDIMX]; // difference between gchan0 and actual grid position + PointErrorType pe[BLOCKDIMZ][BLOCKDIMY][BLOCKDIMX]; // pointing errors + AntennaScaleType as[BLOCKDIMY][BLOCKDIMX]; // antenna scaling + } shared; + + + int i; + + // Precompute l and m scaling factors in shared memory + if(threadIdx.z == 0 && threadIdx.y == 0 && threadIdx.z == 0) + { + shared.lscale = T(BEAM_LW - 1) / (beam_ul - beam_ll); + shared.mscale = T(BEAM_MH - 1) / (beam_um - beam_lm); + } + + // Pointing errors vary by time, antenna and channel, + i = (TIME*NA + ANT)*NCHAN + POLCHAN; + shared.pe[threadIdx.z][threadIdx.y][threadIdx.x] = point_errors[i]; + + // Antenna scaling factors vary by antenna and channel, + // but not timestep + if(threadIdx.z == 0) + { + i = ANT*NCHAN + POLCHAN; + shared.as[threadIdx.y][threadIdx.x] = antenna_scaling[i]; + } + + // Frequency vary by channel, but not timestep or antenna + if(threadIdx.z == 0 && threadIdx.y == 0) + { + // Channel coordinate + // channel grid position + T freqscale = T(BEAM_NUD-1) / (beam_ufreq - beam_lfreq); + T chan = freqscale * (frequency[POLCHAN] - beam_lfreq); + // clamp to grid edges + chan = Po::clamp(chan, 0.0, BEAM_NUD-1); + // Snap to grid coordinate + shared.gchan0[threadIdx.x] = Po::floor(chan); + shared.gchan1[threadIdx.x] = Po::min( + shared.gchan0[threadIdx.x] + 1.0, BEAM_NUD-1); + // Offset of snapped coordinate from grid position + shared.chd[threadIdx.x] = chan - shared.gchan0[threadIdx.x]; + } + + // Parallactic angles vary by time and antenna, but not channel + if(threadIdx.x == 0) + { + i = TIME*NA + ANT; + T parangle = parallactic_angles[i]; + Po::sincos(parangle, + &shared.pa_sin[threadIdx.z][threadIdx.y], + &shared.pa_cos[threadIdx.z][threadIdx.y]); + } + + __syncthreads(); + + // Loop over sources + for(int SRC=0; SRC < DEXT(nsrc); ++SRC) + { + // lm coordinate for this source + LMType rlm = lm[SRC]; + + // L coordinate + // Rotate + T l = rlm.x*shared.pa_cos[threadIdx.z][threadIdx.y] - + rlm.y*shared.pa_sin[threadIdx.z][threadIdx.y]; + // Add the pointing errors for this antenna. + l += shared.pe[threadIdx.z][threadIdx.y][threadIdx.x].x; + // Scale by antenna scaling factors + l *= shared.as[threadIdx.y][threadIdx.x].x; + // l grid position + l = shared.lscale * (l - beam_ll); + // clamp to grid edges + l = Po::clamp(0.0, l, BEAM_LW-1); + // Snap to grid coordinate + T gl0 = Po::floor(l); + T gl1 = Po::min(gl0 + 1.0, BEAM_LW-1); + // Offset of snapped coordinate from grid position + T ld = l - gl0; + + // M coordinate + // rotate + T m = rlm.x*shared.pa_sin[threadIdx.z][threadIdx.y] + + rlm.y*shared.pa_cos[threadIdx.z][threadIdx.y]; + // Add the pointing errors for this antenna. + m += shared.pe[threadIdx.z][threadIdx.y][threadIdx.x].y; + // Scale by antenna scaling factors + m *= shared.as[threadIdx.y][threadIdx.x].y; + // m grid position + m = shared.mscale * (m - beam_lm); + // clamp to grid edges + m = Po::clamp(0.0, m, BEAM_MH-1); + // Snap to grid position + T gm0 = Po::floor(m); + T gm1 = Po::min(gm0 + 1.0, BEAM_MH-1); + // Offset of snapped coordinate from grid position + T md = m - gm0; + + typename Tr::ct pol_sum = Po::make_ct(0.0, 0.0); + typename Tr::ft abs_sum = T(0.0); + + // A simplified trilinear weighting is used here. Given + // point x between points x1 and x2, with function f + // provided values f(x1) and f(x2) at these points. + // + // x1 ------- x ---------- x2 + // + // Then, the value of f can be approximated using the following: + // f(x) ~= f(x1)(x2-x)/(x2-x1) + f(x2)(x-x1)/(x2-x1) + // + // Note how the value f(x1) is weighted with the distance + // from the opposite point (x2-x). + // + // As we are interpolating on a grid, we have the following + // 1. (x2 - x1) == 1 + // 2. (x - x1) == 1 - 1 + (x - x1) + // == 1 - (x2 - x1) + (x - x1) + // == 1 - (x2 - x) + // 2. (x2 - x) == 1 - 1 + (x2 - x) + // == 1 - (x2 - x1) + (x2 - x) + // == 1 - (x - x1) + // + // Extending the above to 3D, we have + // f(x,y,z) ~= f(x1,y1,z1)(x2-x)(y2-y)(z2-z) + ... + // + f(x2,y2,z2)(x-x1)(y-y1)(z-z1) + // + // f(x,y,z) ~= f(x1,y1,z1)(1-(x-x1))(1-(y-y1))(1-(z-z1)) + ... + // + f(x2,y2,z2) (x-x1) (y-y1) (z-z1) + + // Load in the complex values from the E beam + // at the supplied coordinate offsets. + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl0, gm0, shared.gchan0[threadIdx.x], + (1.0f-ld)*(1.0f-md)*(1.0f-shared.chd[threadIdx.x])); + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl1, gm0, shared.gchan0[threadIdx.x], + ld*(1.0f-md)*(1.0f-shared.chd[threadIdx.x])); + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl0, gm1, shared.gchan0[threadIdx.x], + (1.0f-ld)*md*(1.0f-shared.chd[threadIdx.x])); + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl1, gm1, shared.gchan0[threadIdx.x], + ld*md*(1.0f-shared.chd[threadIdx.x])); + + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl0, gm0, shared.gchan1[threadIdx.x], + (1.0f-ld)*(1.0f-md)*shared.chd[threadIdx.x]); + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl1, gm0, shared.gchan1[threadIdx.x], + ld*(1.0f-md)*shared.chd[threadIdx.x]); + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl0, gm1, shared.gchan1[threadIdx.x], + (1.0f-ld)*md*shared.chd[threadIdx.x]); + trilinear_interpolate_J1(pol_sum, abs_sum, E_beam, + gl1, gm1, shared.gchan1[threadIdx.x], + ld*md*shared.chd[threadIdx.x]); + + // Normalise the angle and multiply in the absolute sum + typename Tr::ft norm = Po::rsqrt(pol_sum.x*pol_sum.x + pol_sum.y*pol_sum.y); + if(!::isfinite(norm)) + { norm = 1.0; } + + pol_sum.x *= norm * abs_sum; + pol_sum.y *= norm * abs_sum; + + i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN; + jones[i] = pol_sum; + } +} + + + + + extern "C" { #define stamp_jones_E_beam_fn(ft,ct,lm_type,pa_type,pe_type,as_type) \ @@ -365,6 +600,26 @@ beam_ul, beam_um, beam_ufreq); \ } +#define stamp_jones1_E_beam_fn(ft,ct,lm_type,pa_type,pe_type,as_type) \ +__global__ void \ +rime_jones1_E_beam_ ## ft( \ + lm_type * lm, \ + pa_type * parallactic_angles, \ + pe_type * point_errors, \ + as_type * antenna_scaling, \ + ft * frequency, \ + ct * E_beam, \ + ct * jones, \ + ft beam_ll, ft beam_lm, ft beam_lfreq, \ + ft beam_ul, ft beam_um, ft beam_ufreq) \ +{ \ + rime_jones1_E_beam_impl( \ + lm, parallactic_angles, point_errors, \ + antenna_scaling, frequency, E_beam, jones, \ + beam_ll, beam_lm, beam_lfreq, \ + beam_ul, beam_um, beam_ufreq); \ +} + ${stamp_function} } // extern "C" { @@ -376,7 +631,7 @@ def __init__(self): def initialise(self, solver, stream=None): slvr = solver - ntime, na, npolchan = slvr.dim_local_size('ntime', 'na', 'npolchan') + ntime, na, npolchan, npol = slvr.dim_local_size('ntime', 'na', 'npolchan', 'npol') # Get a property dictionary off the solver D = slvr.template_dict() @@ -400,12 +655,20 @@ def initialise(self, solver, stream=None): 'float' if slvr.is_float() else 'double', 'float2' if slvr.is_float() else 'double2', 'float2' if slvr.is_float() else 'double2']) - stamp_fn = ''.join(['stamp_jones_E_beam_fn(', stamp_args, ')']) - D['stamp_function'] = stamp_fn - kname = 'rime_jones_E_beam_float' \ - if slvr.is_float() is True else \ - 'rime_jones_E_beam_double' + if npol == 4: + stamp_fn = ''.join(['stamp_jones_E_beam_fn(', stamp_args, ')']) + D['stamp_function'] = stamp_fn + kname = 'rime_jones_E_beam_float' \ + if slvr.is_float() is True else \ + 'rime_jones_E_beam_double' + + if npol == 1: + stamp_fn = ''.join(['stamp_jones1_E_beam_fn(', stamp_args, ')']) + D['stamp_function'] = stamp_fn + kname = 'rime_jones1_E_beam_float' \ + if slvr.is_float() is True else \ + 'rime_jones1_E_beam_double' kernel_string = KERNEL_TEMPLATE.substitute(**D) diff --git a/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py index ba802f801..325d0800a 100644 --- a/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py +++ b/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py @@ -110,7 +110,6 @@ class EKBTraits { int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x; int ANT = blockIdx.y*blockDim.y + threadIdx.y; int TIME = blockIdx.z*blockDim.z + threadIdx.z; - #define POL (threadIdx.x & 0x3) if(TIME >= DEXT(ntime) || ANT >= DEXT(na) || POLCHAN >= DEXT(npolchan)) return; @@ -134,7 +133,13 @@ class EKBTraits { // Wavelengths vary by channel, not by time and antenna if(threadIdx.y == 0 && threadIdx.z == 0) + { + if (NPOL > 1) { freq[threadIdx.x] = frequency[POLCHAN>>2]; } + else + { freq[threadIdx.x] = frequency[POLCHAN]; } + } + __syncthreads(); @@ -168,7 +173,11 @@ class EKBTraits { i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN; // Load in the E Beam, and multiply it by KB typename Tr::ct J = jones[i]; - montblanc::jones_multiply_4x4_in_place(J, cplx_phase); + if (NPOL > 1) + montblanc::jones_multiply_4x4_in_place(J, cplx_phase); + else + montblanc::complex_multiply_in_place(J, cplx_phase); + // Write out the jones matrices i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN; diff --git a/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py b/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py index 991f208fc..049554af7 100644 --- a/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py +++ b/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py @@ -174,7 +174,12 @@ class SumCohTraits { // TODO uses 4 times the actually required space, since // we don't need to store a frequency per polarisation if(threadIdx.y == 0 && threadIdx.z == 0) + { + if (NPOL > 1) { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; } + else + { shared.freq[threadIdx.x] = frequency[POLCHAN]; } + } // We process sources in batches, accumulating visibility values. // Visibilities are read in and written out at the start and end @@ -202,7 +207,10 @@ class SumCohTraits { typename Tr::ct ant_one = jones[i]; i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN; typename Tr::ct ant_two = jones[i]; - montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + if (NPOL > 1) + montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + else + montblanc::complex_conjugate_multiply_in_place(ant_one, ant_two); polsum.x += ant_one.x; polsum.y += ant_one.y; @@ -248,7 +256,11 @@ class SumCohTraits { ant_one.y *= exp; i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN; typename Tr::ct ant_two = jones[i]; - montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + if (NPOL > 1) + montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + else + montblanc::complex_conjugate_multiply_in_place(ant_one, ant_two); + polsum.x += ant_one.x; polsum.y += ant_one.y; @@ -298,7 +310,10 @@ class SumCohTraits { ant_one.y *= sersic_factor; i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN; typename Tr::ct ant_two = jones[i]; - montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + if (NPOL > 1) + montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + else + montblanc::complex_conjugate_multiply_in_place(ant_one, ant_two); polsum.x += ant_one.x; polsum.y += ant_one.y; @@ -320,12 +335,20 @@ class SumCohTraits { // Multiply the visibility by antenna 1's g term i = (TIME*NA + ANT1)*NPOLCHAN + POLCHAN; typename Tr::ct model_vis = G_term[i]; - montblanc::jones_multiply_4x4_in_place(model_vis, polsum); + if (NPOL > 1) + montblanc::jones_multiply_4x4_in_place(model_vis, polsum); + else + montblanc::complex_multiply_in_place(model_vis, polsum); + // Multiply the visibility by antenna 2's g term i = (TIME*NA + ANT2)*NPOLCHAN + POLCHAN; typename Tr::ct ant2_g_term = G_term[i]; - montblanc::jones_multiply_4x4_hermitian_transpose_in_place(model_vis, ant2_g_term); + if (NPOL > 1) + montblanc::jones_multiply_4x4_hermitian_transpose_in_place(model_vis, ant2_g_term); + else + montblanc::complex_conjugate_multiply_in_place(model_vis, ant2_g_term); + // Compute the chi squared sum terms i = (TIME*NBL + BL)*NPOLCHAN + POLCHAN; @@ -367,28 +390,36 @@ class SumCohTraits { obs_vis.y *= w; } - // Now, add the real and imaginary components - // of each adjacent group of four polarisations - // into the first polarisation. - typename Tr::ct other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 2); - - // Add polarisations 2 and 3 to 0 and 1 - if((POLCHAN & 0x3) < 2) + if (NPOL > 1) { - obs_vis.x += other.x; - obs_vis.y += other.y; - } - - other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 1); - - // If this is the polarisation 0, add polarisation 1 - // and write out this chi squared sum term - if((POLCHAN & 0x3) == 0) { + // Now, add the real and imaginary components + // of each adjacent group of four polarisations + // into the first polarisation. + typename Tr::ct other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 2); + + // Add polarisations 2 and 3 to 0 and 1 + if((POLCHAN & 0x3) < 2) + { + obs_vis.x += other.x; + obs_vis.y += other.y; + } + + other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 1); + + // If this is the polarisation 0, add polarisation 1 + // and write out this chi squared sum term + if((POLCHAN & 0x3) == 0) { obs_vis.x += other.x; obs_vis.y += other.y; i = (TIME*NBL + BL)*NCHAN + (POLCHAN >> 2); chi_sqrd_result[i] = obs_vis.x + obs_vis.y; + } + } + else + { + i = (TIME*NBL + BL)*NCHAN + POLCHAN; + chi_sqrd_result[i] = obs_vis.x + obs_vis.y; } } diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 1a8a2ab7d..6138019e7 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -197,7 +197,13 @@ class SumCohTraits { // TODO uses 4 times the actually required space, since // we don't need to store a frequency per polarisation if(threadIdx.y == 0 && threadIdx.z == 0) - { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; } + { + if (NPOL >1) + { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; } + else + { shared.freq[threadIdx.x] = frequency[POLCHAN]; } + } + i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN; delta = observed_vis[i]; @@ -278,7 +284,11 @@ class SumCohTraits { ant_one.y *= sersic_factor; i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN; typename Tr::ct ant_two = jones[i]; - montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + if (NPOL > 1) + montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant_one, ant_two); + else + montblanc::complex_conjugate_multiply_in_place(ant_one, ant_two); + dev_vis[0].x = ant_one.x*dev_e1; dev_vis[0].y = ant_one.y*dev_e1; @@ -286,7 +296,7 @@ class SumCohTraits { dev_vis[1].y = ant_one.y*dev_e2; dev_vis[2].x = ant_one.x*dev_scale; dev_vis[2].y = ant_one.y*dev_scale; - + /* for (int p = 0; p < 3; p++) { // Multiply the visibility derivative by antenna 1's g term @@ -300,7 +310,7 @@ class SumCohTraits { montblanc::jones_multiply_4x4_hermitian_transpose_in_place(ant1_g_term, ant2_g_term); dev_vis[p] = ant1_g_term; } - + */ } // Write partial derivative with respect to sersic parameters for (int p=0; p<3; p++) diff --git a/montblanc/impl/rime/v4/loaders/loaders.py b/montblanc/impl/rime/v4/loaders/loaders.py index eaabc062e..63b1c933f 100644 --- a/montblanc/impl/rime/v4/loaders/loaders.py +++ b/montblanc/impl/rime/v4/loaders/loaders.py @@ -250,4 +250,4 @@ def __enter__(solver): return super(MeasurementSetLoader,solver).__enter__() def __exit__(solver, type, value, traceback): - return super(MeasurementSetLoader,solver).__exit__(type,value,traceback) \ No newline at end of file + return super(MeasurementSetLoader,solver).__exit__(type,value,traceback) diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py index 5bbb6abd7..47d9b98ee 100644 --- a/montblanc/impl/rime/v5/CompositeRimeSolver.py +++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py @@ -307,6 +307,9 @@ def _gen_vis_slices(self): cpu_slice['nvis'] = slice(0, 0, 1) gpu_slice['nvis'] = slice(0, 0, 1) + cpu_slice['npol'] = slice(0, npol, 1) + gpu_slice['npol'] = slice(0, npol, 1) + montblanc.log.debug('Generating RIME slices') # Set up time slicing @@ -1172,7 +1175,7 @@ def _sync_wait(future): X2_sum = self.ft(0.0) if self.enable_sersic_grad: nssrc = self.dim_local_size('nssrc') - X2_grad = self.ft(np.zeros((3,nssrc))) + self.X2_grad = self.ft(np.zeros((3,nssrc))) # Sets of return value futures for each executor value_futures = [set() for ex in self.enqueue_executors] @@ -1246,7 +1249,7 @@ def _accumulate(model_slice, model_chunk): # Get chi-squared and model visibilities (and X2 gradient) if self.enable_sersic_grad: X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() - X2_grad += pinned_X2_grad + self.X2_grad += pinned_X2_grad else: X2, pinned_model_vis, model_vis_idx = f.result() # Sum X2 @@ -1269,7 +1272,7 @@ def _accumulate(model_slice, model_chunk): # Get chi-squared and model visibilities (and X2 gradient) if self.enable_sersic_grad: X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result() - X2_grad += pinned_X2_grad + self.X2_grad += pinned_X2_grad else: X2, pinned_model_vis, model_vis_idx = f.result() # Sum X2 @@ -1284,7 +1287,7 @@ def _accumulate(model_slice, model_chunk): else: self.set_X2(X2_sum/self.sigma_sqrd) if self.enable_sersic_grad: - self.X2_grad = X2_grad/self.sigma_sqrd + self.X2_grad = self.X2_grad/self.sigma_sqrd def shutdown(self): """ Shutdown the solver """ diff --git a/montblanc/impl/rime/v5/loaders/loaders.py b/montblanc/impl/rime/v5/loaders/loaders.py index a35d25848..28545a661 100644 --- a/montblanc/impl/rime/v5/loaders/loaders.py +++ b/montblanc/impl/rime/v5/loaders/loaders.py @@ -335,4 +335,4 @@ def __enter__(solver): return super(MeasurementSetLoader,solver).__enter__() def __exit__(solver, type, value, traceback): - return super(MeasurementSetLoader,solver).__exit__(type,value,traceback) \ No newline at end of file + return super(MeasurementSetLoader,solver).__exit__(type,value,traceback) From a3b51fd3f38f0d354202c51ee19beb2f95e6391b Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Thu, 11 May 2017 17:25:49 +0100 Subject: [PATCH 31/33] allowed solver reading context from the slvr_cfg --- montblanc/factory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/montblanc/factory.py b/montblanc/factory.py index eae4c4b54..3be22fd04 100644 --- a/montblanc/factory.py +++ b/montblanc/factory.py @@ -168,7 +168,8 @@ def rime_solver(slvr_cfg): elif version == Options.VERSION_FIVE: from montblanc.impl.rime.v5.CompositeRimeSolver \ import CompositeRimeSolver as RimeSolver - slvr_cfg[Options.CONTEXT] = __contexts + if slvr_cfg.get(Options.CONTEXT, None) is None: + slvr_cfg[Options.CONTEXT] = __contexts else: raise ValueError('Invalid version %s' % version) From fc46cbb53927be44636aa4c7819e72adc6eb14a9 Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Tue, 19 Sep 2017 13:35:25 +0100 Subject: [PATCH 32/33] bugs fixed: E_beam and G_term matrix default, RimeBSqrt function for npol=1 --- montblanc/impl/rime/v4/config.py | 4 ++-- montblanc/impl/rime/v4/gpu/RimeBSqrt.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py index bf801d3ed..4c7cb6a3c 100644 --- a/montblanc/impl/rime/v4/config.py +++ b/montblanc/impl/rime/v4/config.py @@ -181,14 +181,14 @@ class Classifier(Enum): ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 'npol'), 'ct', classifiers=frozenset([Classifier.E_BEAM_INPUT]), - default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 + default=lambda s, ary: (np.array([1,0,0,1])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 else np.array([1])[np.newaxis,np.newaxis,:]), test=lambda slvr, ary: rary(ary)), # Direction-Independent Effects ary_dict('G_term', ('ntime', 'na', 'nchan', 'npol'), 'ct', classifiers=frozenset([Classifier.COHERENCIES_INPUT]), - default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 + default=lambda s, ary: (np.array([1,0,0,1])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 else np.array([1])[np.newaxis,np.newaxis,:]), test=lambda slvr, ary: rary(ary)), diff --git a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py index 1e56313fe..0c6df45f3 100644 --- a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py +++ b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py @@ -169,7 +169,7 @@ // Read in the stokes parameter, // multiplying it by the power term - typename Tr::ft pol = stokes[i]*power; + typename Tr::ft pol = 2.*stokes[i]*power; // include both the equal terms XX and YY // Write out the square root of the brightness i = (SRC*NTIME + TIME)*NPOLCHAN + POLCHAN; From 4d06dbdb37d0dfd4f618992e02a14609cf831cff Mon Sep 17 00:00:00 2001 From: Marzia Rivi Date: Fri, 22 Sep 2017 16:39:02 +0100 Subject: [PATCH 33/33] bug fixed for the case npol=1: final reduction didn't take into account the case for a single channel where num threadIdx.x = 1 < 3 --- montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py index 6138019e7..f1389cb01 100644 --- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py +++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py @@ -333,10 +333,10 @@ class SumCohTraits { __syncthreads(); //atomic addition to avoid concurrent access in the device memory (contribution for a single particle) // 3 different threads writes each a different component to avoid serialisation - if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0) + if (threadIdx.x == 0 && threadIdx.y < 3 && threadIdx.z == 0) { - i = SRC - SRC_START + threadIdx.x*DEXT(nssrc); - atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]); + i = SRC - SRC_START + threadIdx.y*DEXT(nssrc); + atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.y]); } }