From be1c515a336a21d8687375bc86902ed76eea2bd7 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Fri, 15 Jul 2016 14:09:06 +0100
Subject: [PATCH 01/33] GPU chi squared gradient with respect ro Sersic
 parameters

---
 .../rime/v4/gpu/SersicChiSquaredGradient.py   | 473 ++++++++++++++++++
 1 file changed, 473 insertions(+)
 create mode 100644 montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
new file mode 100644
index 000000000..8e6bafa40
--- /dev/null
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -0,0 +1,473 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi, Simon Perkins
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+import string
+
+import pycuda.driver as cuda
+import pycuda.gpuarray as gpuarray
+from pycuda.compiler import SourceModule
+
+import montblanc
+import montblanc.util as mbu
+from montblanc.node import Node
+
+
+from montblanc.config import RimeSolverConfig as Options
+
+FLOAT_PARAMS = {
+    'BLOCKDIMX': 4,    # Number of channels*4 polarisations
+    'BLOCKDIMY': 25,     # Number of baselines
+    'BLOCKDIMZ': 10,     # Number of timesteps
+    'maxregs': 48       # Maximum number of registers
+}
+
+DOUBLE_PARAMS = {
+    'BLOCKDIMX': 4,    # Number of channels*4 polarisations
+    'BLOCKDIMY': 25,     # Number of baselines
+    'BLOCKDIMZ': 10,     # Number of timesteps
+    'maxregs': 63       # Maximum number of registers
+}
+
+KERNEL_TEMPLATE = string.Template("""
+#include <stdint.h>
+#include <cstdio>
+#include \"math_constants.h\"
+#include <montblanc/include/abstraction.cuh>
+#include <montblanc/include/jones.cuh>
+
+#define BLOCKDIMX ${BLOCKDIMX}
+#define BLOCKDIMY ${BLOCKDIMY}
+#define BLOCKDIMZ ${BLOCKDIMZ}
+
+#define TWO_PI_OVER_C ${two_pi_over_c}
+
+template <typename T>
+class SumCohTraits {};
+
+template <>
+class SumCohTraits<float> {
+public:
+    typedef float3 UVWType;
+};
+
+template <>
+class SumCohTraits<double> {
+public:
+    typedef double3 UVWType;
+};
+
+// Here, the definition of the
+// rime_const_data struct
+// is inserted into the template
+// An area of constant memory
+// containing an instance of this
+// structure is declared. 
+${rime_const_data_struct}
+__constant__ rime_const_data C;
+#define LEXT(name) C.name.lower_extent
+#define UEXT(name) C.name.upper_extent
+#define DEXT(name) (C.name.upper_extent - C.name.lower_extent)
+
+#define NA (C.na.local_size)
+#define NBL (C.nbl.local_size)
+#define NCHAN (C.nchan.local_size)
+#define NTIME (C.ntime.local_size)
+#define NPSRC (C.npsrc.local_size)
+#define NGSRC (C.ngsrc.local_size)
+#define NSSRC (C.nssrc.local_size)
+#define NSRC (C.nnsrc.local_size)
+#define NPOL (C.npol.local_size)
+#define NPOLCHAN (C.npolchan.local_size)
+
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#else
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull =
+                             (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val +
+                               __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+
+
+template <
+    typename T,
+    bool apply_weights=false,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> >
+__device__
+void sersic_chi_squared_gradient_impl(
+    typename SumCohTraits<T>::UVWType * uvw,
+    typename Tr::ft * sersic_shape,
+    typename Tr::ft * frequency,
+    int * antenna1,
+    int * antenna2,
+    typename Tr::ct * jones,
+    uint8_t * flag,
+    typename Tr::ft * weight_vector,
+    typename Tr::ct * observed_vis,
+    typename Tr::ct * G_term,
+    typename Tr::ct * visibilities,
+    typename Tr::ft * X2_grad)
+{
+    int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
+    int BL = blockIdx.y*blockDim.y + threadIdx.y;
+    int TIME = blockIdx.z*blockDim.z + threadIdx.z;
+
+    if(TIME >= DEXT(ntime) || BL >= DEXT(nbl) || POLCHAN >= DEXT(npolchan))
+        return;
+
+    __shared__ struct {
+        typename SumCohTraits<T>::UVWType uvw[BLOCKDIMZ][BLOCKDIMY];
+
+        T e1;
+        T e2;
+        T sersic_scale;
+
+        T X2_grad_part[3];
+
+        T freq[BLOCKDIMX];
+
+    } shared;
+
+    typename Tr::ct dev_vis[3];
+
+    T & U = shared.uvw[threadIdx.z][threadIdx.y].x; 
+    T & V = shared.uvw[threadIdx.z][threadIdx.y].y; 
+    T & W = shared.uvw[threadIdx.z][threadIdx.y].z; 
+
+    int i;
+
+    // Figure out the antenna pairs
+    i = TIME*NBL + BL;   
+    int ANT1 = antenna1[i];
+    int ANT2 = antenna2[i];
+
+    // UVW coordinates vary by baseline and time, but not polarised channel
+    if(threadIdx.x == 0)
+    {
+        // UVW, calculated from u_pq = u_p - u_q
+        i = TIME*NA + ANT1;
+        shared.uvw[threadIdx.z][threadIdx.y] = uvw[i];
+
+        i = TIME*NA + ANT2;
+        typename SumCohTraits<T>::UVWType ant2_uvw = uvw[i];
+        U -= ant2_uvw.x;
+        V -= ant2_uvw.y;
+        W -= ant2_uvw.z;
+    }
+
+    // Wavelength varies by channel, but not baseline and time
+    // TODO uses 4 times the actually required space, since
+    // we don't need to store a frequency per polarisation
+    if(threadIdx.y == 0 && threadIdx.z == 0)
+        { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; }
+
+    i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN;
+    typename Tr::ct delta = observed_vis[i];
+    delta.x -= visibilities[i].x;
+    delta.y -= visibilities[i].y;
+
+    // Zero the polarisation if it is flagged
+    if(flag[i] > 0)
+    {
+        delta.x = 0; delta.y = 0;
+    }
+
+    // Apply any necessary weighting factors
+    if(apply_weights)
+    {
+        T w = weight_vector[i];
+        delta.x *= w;
+        delta.y *= w;
+    }
+
+    int SRC_START = DEXT(npsrc) + DEXT(ngsrc);
+    int SRC_STOP = SRC_START + DEXT(nssrc);
+
+    // Loop over Sersic Sources
+    for(int SRC = SRC_START; SRC < SRC_STOP; ++SRC)
+    {
+        // sersic shape only varies by source. Shape parameters
+        // thus apply to the entire block and we can load them with
+        // only the first thread.
+        if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+        {
+            i = SRC - SRC_START;                 shared.e1 = sersic_shape[i];
+            i += DEXT(nssrc);                    shared.e2 = sersic_shape[i];
+            i += DEXT(nssrc);                    shared.sersic_scale = sersic_shape[i];
+
+            shared.X2_grad_part[0] = 0.;
+            shared.X2_grad_part[1] = 0.;
+            shared.X2_grad_part[2] = 0.;
+        }
+
+        __syncthreads();
+
+        // Create references to a
+        // complicated part of shared memory
+        const T & U = shared.uvw[threadIdx.z][threadIdx.y].x; 
+        const T & V = shared.uvw[threadIdx.z][threadIdx.y].y; 
+
+        // sersic source in  the Fourier domain
+        T u1 = U*(T(1.0)+shared.e1) + V*shared.e2;
+        T v1 = U*shared.e2 + V*(T(1.0)-shared.e1);
+
+        T sersic_factor = T(1.0)-shared.e1*shared.e1-shared.e2*shared.e2;
+        T dev_e1 = u1*(U*sersic_factor + 2*shared.e1*u1);
+        dev_e1 += v1*(2*shared.e1*v1 - V*sersic_factor);
+        T dev_e2 = u1*(V*sersic_factor + 2*shared.e2*u1);
+        dev_e2 += v1*(U*sersic_factor + 2*shared.e2*v1);
+
+        // pay attention to the instructions order because temp and sersic_factor variables are utilised twice!!!
+        T temp = T(TWO_PI_OVER_C)*shared.freq[threadIdx.x]*shared.sersic_scale;
+        temp /= sersic_factor;
+        temp *= temp;
+        dev_e1 *= temp/sersic_factor;
+        dev_e2 *= temp/sersic_factor;     
+   
+        temp *= (u1*u1+v1*v1);
+        sersic_factor = T(1.0)+temp;
+        dev_e1 /= sersic_factor;
+        dev_e2 /= sersic_factor;
+
+        T dev_scale = temp/(shared.sersic_scale*sersic_factor);
+
+        // Get the complex scalars for antenna two and multiply
+        // in the exponent term
+        // Get the complex scalar for antenna one and conjugate it
+        i = ((SRC*NTIME + TIME)*NA + ANT1)*NPOLCHAN + POLCHAN;
+        typename Tr::ct ant_one = jones[i];
+        sersic_factor = T(1.0) / (sersic_factor*Po::sqrt(sersic_factor));
+        ant_one.x *= sersic_factor;
+        ant_one.y *= sersic_factor;
+        i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
+        typename Tr::ct ant_two = jones[i];
+        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+
+        dev_vis[0].x = ant_one.x*dev_e1;
+        dev_vis[0].y = ant_one.y*dev_e1;
+        dev_vis[1].x = ant_one.x*dev_e2;
+        dev_vis[1].y = ant_one.y*dev_e2;
+        dev_vis[2].x = ant_one.x*dev_scale;
+        dev_vis[2].y = ant_one.y*dev_scale;
+  /* 
+        for (int p = 0; p < 3; p++)
+        {
+          // Multiply the visibility derivative by antenna 1's g term
+          i = (TIME*NA + ANT1)*NPOLCHAN + POLCHAN;
+          typename Tr::ct ant1_g_term = G_term[i];
+          montblanc::jones_multiply_4x4_in_place<T>(ant1_g_term, dev_vis[p]);
+
+          // Multiply the visibility by antenna 2's g term
+          i = (TIME*NA + ANT2)*NPOLCHAN + POLCHAN;
+          typename Tr::ct ant2_g_term = G_term[i];
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant1_g_term, ant2_g_term);    
+          dev_vis[p] = ant1_g_term;
+        }
+   */
+    
+        // Write partial derivative with respect to sersic parameters
+        for (int p=0; p<3; p++)
+        {
+          dev_vis[p].x *= delta.x;
+          dev_vis[p].y *= delta.y;
+
+          typename Tr::ct other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 2);
+          // Add polarisations 2 and 3 to 0 and 1
+          if((POLCHAN & 0x3) < 2)
+          {
+            dev_vis[p].x += other.x;
+            dev_vis[p].y += other.y;
+          }
+          other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 1);
+
+          // If this is the polarisation 0, add polarisation 1
+          // and write out this chi squared grad term
+          if((POLCHAN & 0x3) == 0) 
+          {
+            dev_vis[p].x += other.x;
+            dev_vis[p].y += other.y;
+
+            //atomic addition to avoid concurrent access in the shared memory
+            dev_vis[p].x += dev_vis[p].y;
+            atomicAdd(&(shared.X2_grad_part[p]), dev_vis[p].x);
+          }
+        }
+        __syncthreads();
+        
+        //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
+        if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+        {
+            i = SRC - DEXT(npsrc) - DEXT(ngsrc);
+            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[0]);
+            i += DEXT(nssrc);
+            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[1]);
+            i += DEXT(nssrc);
+            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[2]);
+        }
+    }
+}
+
+extern "C" {
+
+// Macro that stamps out different kernels, depending
+// on whether we're handling floats or doubles
+// Arguments
+// - ft: The floating point type. Should be float/double.
+// - ct: The complex type. Should be float2/double2.
+// - apply_weights: boolean indicating whether we're weighting our visibilities
+
+#define stamp_sersic_chi_squared_gradient_fn(ft, ct, uvwt, apply_weights) \
+__global__ void \
+sersic_chi_squared_gradient( \
+    uvwt * uvw, \
+    ft * sersic_shape, \
+    ft * frequency, \
+    int * antenna1, \
+    int * antenna2, \
+    ct * jones, \
+    uint8_t * flag,\
+    ft * weight_vector, \
+    ct * observed_vis, \
+    ct * G_term, \
+    ct * visibilities, \
+    ft * X2_grad) \
+{ \
+    sersic_chi_squared_gradient_impl<ft, apply_weights>(uvw, sersic_shape, \
+        frequency, antenna1, antenna2, jones, flag,\
+        weight_vector, observed_vis, G_term, \
+        visibilities, X2_grad); \
+}
+
+${stamp_function}
+
+} // extern "C" {
+""")
+
+class SersicChiSquaredGradient(Node):
+    def __init__(self, weight_vector=False):
+        super(SersicChiSquaredGradient, self).__init__()
+        self.weight_vector = weight_vector
+
+    def initialise(self, solver, stream=None):
+        slvr = solver
+        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
+
+          # Get a property dictionary off the solver
+        D = slvr.template_dict()
+        # Include our kernel parameters
+        D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS)
+        D['rime_const_data_struct'] = slvr.const_data().string_def()
+
+        D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \
+            mbu.redistribute_threads(D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'],
+            npolchan, nbl, ntime)
+
+        regs = str(FLOAT_PARAMS['maxregs'] \
+            if slvr.is_float() else DOUBLE_PARAMS['maxregs'])
+
+        # Create the signature of the call to the function stamping macro
+        stamp_args = ', '.join([
+            'float' if slvr.is_float() else 'double',
+            'float2' if slvr.is_float() else 'double2',
+            'float3' if slvr.is_float() else 'double3',
+            'true' if slvr.use_weight_vector() else 'false'])
+        stamp_fn = ''.join(['stamp_sersic_chi_squared_gradient_fn(', stamp_args, ')'])
+        D['stamp_function'] = stamp_fn
+
+        kname = 'sersic_chi_squared_gradient'
+
+        self.mod = SourceModule(
+            KERNEL_TEMPLATE.substitute(**D),
+            options=['-lineinfo','-maxrregcount', regs],
+            include_dirs=[montblanc.get_source_path()],
+            no_extern_c=True)
+
+        self.rime_const_data_gpu = self.mod.get_global('C')
+        self.kernel = self.mod.get_function(kname)
+        self.launch_params = self.get_launch_params(slvr, D)
+
+    def shutdown(self, solver, stream=None):
+        pass
+
+    def pre_execution(self, solver, stream=None):
+        pass
+
+    def get_launch_params(self, slvr, D):
+        polchans_per_block = D['BLOCKDIMX']
+        bl_per_block = D['BLOCKDIMY']
+        times_per_block = D['BLOCKDIMZ']
+
+        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
+        polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block)
+        bl_blocks = mbu.blocks_required(nbl, bl_per_block)
+        time_blocks = mbu.blocks_required(ntime, times_per_block)
+
+        return {
+            'block' : (polchans_per_block, bl_per_block, times_per_block),
+            'grid'  : (polchan_blocks, bl_blocks, time_blocks),
+        }
+
+    def execute(self, solver, stream=None):
+        slvr = solver
+        nssrc = slvr.dim_local_size('nssrc')
+
+        if stream is not None:
+            cuda.memcpy_htod_async(
+                self.rime_const_data_gpu[0],
+                slvr.const_data().ndary(),
+                stream=stream)
+        else:
+            cuda.memcpy_htod(
+                self.rime_const_data_gpu[0],
+                slvr.const_data().ndary())
+
+        sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
+            else slvr.sersic_shape
+       
+        if slvr.is_float(): 
+            gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32)
+        else:
+            gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64)
+
+        self.kernel(slvr.uvw, sersic,
+            slvr.frequency, slvr.antenna1, slvr.antenna2,
+            slvr.jones, slvr.flag, slvr.weight_vector,
+            slvr.observed_vis, slvr.G_term,
+            slvr.model_vis, gradient,
+            stream=stream, **self.launch_params)
+
+        slvr.X2_grad = 6*gradient.get().reshape(3,nssrc)
+
+        if not self.weight_vector:
+            slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd
+
+    def post_execution(self, solver, stream=None):
+        pass

From 9a5623a2eaea3d434c8378dc334cb4767157bf48 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Fri, 15 Jul 2016 14:10:30 +0100
Subject: [PATCH 02/33] CPU chi squared gradient with respect ro Sersic
 parameters

---
 montblanc/impl/rime/v4/cpu/CPUSolver.py | 202 +++++++++++++++++++++++-
 1 file changed, 199 insertions(+), 3 deletions(-)

diff --git a/montblanc/impl/rime/v4/cpu/CPUSolver.py b/montblanc/impl/rime/v4/cpu/CPUSolver.py
index 5026d3d21..621e245bb 100644
--- a/montblanc/impl/rime/v4/cpu/CPUSolver.py
+++ b/montblanc/impl/rime/v4/cpu/CPUSolver.py
@@ -650,6 +650,144 @@ def compute_gekb_vis(self, ekb_vis=None):
 
         return result
 
+
+    def compute_sersic_derivatives(self):
+        """
+        Compute the partial derivative values for the sersic (exponential) sources.
+        Returns a (nssrc, ntime, nbl, nchan) matrix of floating point scalars.
+        """
+
+        nssrc, ntime, nbl, nchan  = self.dim_local_size('nssrc', 'ntime', 'nbl', 'nchan')
+        ant0, ant1 = self.ap_idx()
+
+        # Calculate per baseline u from per antenna u
+        up, uq = self.uvw[:,:,0][ant0], self.uvw[:,:,0][ant1]
+        u = ne.evaluate('uq-up', {'up': up, 'uq': uq})
+
+        # Calculate per baseline v from per antenna v
+        vp, vq = self.uvw[:,:,1][ant0], self.uvw[:,:,1][ant1]
+        v = ne.evaluate('vq-vp', {'vp': vp, 'vq': vq})
+
+        # Calculate per baseline w from per antenna w
+        wp, wq = self.uvw[:,:,2][ant0], self.uvw[:,:,2][ant1]
+        w = ne.evaluate('wq-wp', {'wp': wp, 'wq': wq})
+
+        e1 = self.sersic_shape[0]
+        e2 = self.sersic_shape[1]
+        R = self.sersic_shape[2]
+
+        # OK, try obtain the same results with the fwhm factored out!
+        # u1 = u*(1+e1) - v*e2
+        # v1 = u*e2 + v*(1-e1)
+        u1 = ne.evaluate('u_1_e1 + v_e2',
+            { 'u_1_e1': np.outer(1+e1, u), 'v_e2' : np.outer(e2, v)})
+        v1 = ne.evaluate('u_e2 + v_1_e1', 
+            { 'u_e2' : np.outer(e2, u), 'v_1_e1' : np.outer(1-e1,v)})
+
+        assert u1.shape == (nssrc, ntime*nbl)
+        assert v1.shape == (nssrc, ntime*nbl)
+
+        e12 = 1 - e1 * e1 - e2 * e2
+
+        de1 = ne.evaluate('u1*(u_e12+2*e1*u1) + v1*(2*e1*v1-v_e12)',
+            local_dict={
+                'u1'   : u1,  'v1': v1,\
+                'u_e12': np.outer(e12, u),\
+                'v_e12': np.outer(e12, v),\
+                'e1': e1[:,np.newaxis]}).reshape(nssrc, ntime,nbl)
+
+        de2 = ne.evaluate('u1*(v_e12+2*e2*u1) + v1*(2*e2*v1+u_e12)',
+            local_dict={
+                'u1'   : u1,  'v1': v1,\
+                'u_e12': np.outer(e12, u),\
+                'v_e12': np.outer(e12, v),\
+                'e2': e2[:,np.newaxis]}).reshape(nssrc, ntime,nbl)
+
+        wavenum = (self.two_pi_over_c * self.frequency)
+
+        temp = ne.evaluate('(wavenum*R/e12)**2',
+            {
+              'R'  : R[:,np.newaxis],\
+              'e12': e12[:,np.newaxis],\
+              'wavenum': wavenum[np.newaxis,:]}).reshape(nssrc,nchan)
+
+        sfactor = ne.evaluate('1+temp*(u1*u1+v1*v1)',
+            {
+              'u1' : u1[:,:,np.newaxis],\
+              'v1' : v1[:,:,np.newaxis],\
+              'temp'  : temp[:,np.newaxis,:]}).reshape(nssrc,ntime,nbl,nchan)
+
+        sersic_deriv = np.empty((3,nssrc,ntime,nbl,nchan))
+          
+        # partial derivatives with respect to e1 and e2
+        sersic_deriv[0] = ne.evaluate('de1*temp/(sfactor*e12)',
+            { 
+              'de1' : de1[:,:,:,np.newaxis],\
+              'temp': temp[:,np.newaxis,np.newaxis,:],\
+              'e12' : e12[:,np.newaxis,np.newaxis,np.newaxis],\
+              'sfactor': sfactor }).reshape(nssrc,ntime,nbl,nchan)
+
+        sersic_deriv[1] = ne.evaluate('de2*temp/(sfactor*e12)',
+            {
+              'de2' : de2[:,:,:,np.newaxis],\
+              'temp': temp[:,np.newaxis,np.newaxis,:], \
+              'e12' : e12[:,np.newaxis,np.newaxis,np.newaxis],\
+              'sfactor': sfactor }).reshape(nssrc,ntime,nbl,nchan)
+
+        # partial derivatives with respect to scalelength
+        sersic_deriv[2] = ne.evaluate('(sfactor-1)/(sfactor*R)',
+            {
+              'R' : R[:,np.newaxis,np.newaxis,np.newaxis],\
+              'sfactor' : sfactor }).reshape(nssrc,ntime,nbl,nchan)
+
+        return sersic_deriv
+
+
+
+
+    def compute_gekb_vis_grad(self, ekb_jones=None):
+        """
+        Computes the gradient of the complex visibilities with respect to the sersic parameters.
+        Returns a (3,nssrc,ntime,nbl,nchan,4) matrix of complex scalars.
+        """
+
+        nssrc, npsrc, ngsrc, nsrc, ntime, nbl, nchan = self.dim_local_size('nssrc', 'npsrc', 'ngsrc', 'nsrc', 'ntime', 'nbl', 'nchan')
+
+        if ekb_jones is None:
+            ekb_jones = self.compute_ekb_jones_per_bl()
+
+        want_shape = (nsrc, ntime, nbl, nchan, 4)
+        assert ekb_jones.shape == want_shape, \
+            'Expected shape %s. Got %s instead.' % \
+            (want_shape, ekb_jones.shape)
+
+        ant0, ant1 = self.ap_idx(chan=True)
+        g_term_p = self.G_term[ant0]
+        g_term_q = self.G_term[ant1]
+
+        assert g_term_p.shape == (ntime, nbl, nchan, 4)
+
+        gekb_jones = np.empty([nssrc,ntime,nbl,nchan,4])
+        for i in range(nssrc):
+            gekb_jones[i] = (self.jones_multiply(g_term_p, ekb_jones[i])
+                .reshape(ntime, nbl, nchan, 4))
+            gekb_jones[i] = (self.jones_multiply(gekb_jones[i], g_term_q, hermitian=True)
+                .reshape(ntime, nbl, nchan, 4))
+
+        sersic_deriv = self.compute_sersic_derivatives()
+
+        src_beg = npsrc + ngsrc
+        src_end = npsrc + ngsrc + nssrc
+        vis_grad = np.empty((3,nssrc,ntime,nbl,nchan,4),dtype = np.complex)
+        vis_grad[0] = gekb_jones[src_beg:src_end]*sersic_deriv[0,:,:,:,:,np.newaxis]
+        vis_grad[1] = gekb_jones[src_beg:src_end]*sersic_deriv[1,:,:,:,:,np.newaxis]
+        vis_grad[2] = gekb_jones[src_beg:src_end]*sersic_deriv[2,:,:,:,:,np.newaxis]
+
+        assert vis_grad.shape == (3, nssrc, ntime, nbl, nchan, 4)
+
+        return vis_grad
+
+
     def compute_chi_sqrd_sum_terms(self, vis=None):
         """
         Computes the terms of the chi squared sum,
@@ -711,14 +849,72 @@ def compute_chi_sqrd(self, chi_sqrd_terms=None):
         return (term_sum if self.use_weight_vector() is True
             else term_sum / self.sigma_sqrd)
 
+
+    def compute_chi_sqrd_gradient(self, vis=None, vis_grad=None):
+        """ Computes the chi squared gradient with respect to Sersic parameters.
+        Parameters:
+            weight_vector : boolean
+                True if the chi squared test
+                should be computed with a noise vector
+        Returns a (3,nssrc) matrix of real values
+        """
+        nssrc, ntime, nbl, nchan = self.dim_local_size('nssrc', 'ntime', 'nbl', 'nchan')
+
+        # Do the chi squared gradient on the CPU.
+        # If we're not using the weight vector, sum and
+        # divide by the sigma squared.
+        # Otherwise, simply return the sum
+        if vis is None: vis = self.compute_gekb_vis()
+        if vis_grad is None: vis_grad = self.compute_gekb_vis_grad()
+
+
+        # Compute the residuals if this has not yet happened
+        if not self.outputs_residuals():
+            d = ne.evaluate('(ovis - mvis)*where(flag > 0, 0, 1)', {
+                'mvis': vis,
+                'ovis': self.observed_vis,
+                'flag' : self.flag })
+            assert d.shape == (ntime, nbl, nchan, 4)
+        else:
+            d = vis
+
+        re = d.real
+        im = d.imag
+        wv = self.weight_vector
+
+        # Multiply by the weight vector if required
+        if self.use_weight_vector() is True:
+            ne.evaluate('re*wv', {'re': re, 'wv': wv}, out=re)
+            ne.evaluate('im*wv', {'im': im, 'wv': wv}, out=im)       
+
+        # Multiply by the visibilities gradient
+        for p in xrange(3):
+            for s in xrange(nssrc):
+                vis_grad[p,s].real = ne.evaluate('re*dre', {'re': re, 'dre': vis_grad[p,s].real})
+                vis_grad[p,s].imag = ne.evaluate('im*dim', {'im': im, 'dim': vis_grad[p,s].imag})
+
+        # Sum the real and imaginary terms together
+        # for the final result.
+        re_sum = ne.evaluate('sum(re,5)', {'re': vis_grad.real})
+        im_sum = ne.evaluate('sum(im,5)', {'im': vis_grad.imag})
+        chi_sqrd_grad_terms = ne.evaluate('re_sum + im_sum',
+            {'re_sum': re_sum, 'im_sum': im_sum})
+        assert chi_sqrd_grad_terms.shape == (3, nssrc, ntime, nbl, nchan)
+
+        self.X2_grad = 6*np.sum(chi_sqrd_grad_terms,(2,3,4)).reshape(3,nssrc)
+           
+        return (self.X2_grad if self.use_weight_vector() is True \
+            else self.X2_grad / self.sigma_sqrd)
+
+
     def solve(self):
         """ Solve the RIME """
 
         self.jones[:] = self.compute_ekb_sqrt_jones_per_ant()
         
-        self.vis[:] = self.compute_gekb_vis()
+        self.model_vis[:] = self.compute_gekb_vis()
         
         self.chi_sqrd_result[:] = self.compute_chi_sqrd_sum_terms(
-            self.vis)
+            self.model_vis)
         
-        self.set_X2(self.compute_chi_sqrd(self.chi_sqrd_result))
\ No newline at end of file
+        self.set_X2(self.compute_chi_sqrd(self.chi_sqrd_result))

From 667aec15154f2f77ada0051d004a5db3e2f540a1 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Fri, 15 Jul 2016 14:11:50 +0100
Subject: [PATCH 03/33] added chi squared gradient option

---
 montblanc/impl/rime/v4/RimeSolver.py | 18 ++++++++++++++----
 montblanc/impl/rime/v4/config.py     |  2 ++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/montblanc/impl/rime/v4/RimeSolver.py b/montblanc/impl/rime/v4/RimeSolver.py
index 895e7f30e..fc3d519ad 100644
--- a/montblanc/impl/rime/v4/RimeSolver.py
+++ b/montblanc/impl/rime/v4/RimeSolver.py
@@ -30,14 +30,24 @@
 from montblanc.impl.rime.v4.gpu.RimeBSqrt import RimeBSqrt
 from montblanc.impl.rime.v4.gpu.RimeEKBSqrt import RimeEKBSqrt
 from montblanc.impl.rime.v4.gpu.RimeSumCoherencies import RimeSumCoherencies
+from montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient import SersicChiSquaredGradient
 
 from montblanc.pipeline import Pipeline
 
 def get_pipeline(slvr_cfg):
-    return Pipeline([RimeBSqrt(),
-        RimeEBeam(),
-        RimeEKBSqrt(),
-        RimeSumCoherencies()])
+    sersic_grad = slvr_cfg[Options.SERSIC_GRADIENT]
+
+    if (sersic_grad):
+        return Pipeline([RimeBSqrt(),
+            RimeEBeam(),
+            RimeEKBSqrt(),
+            RimeSumCoherencies(),
+            SersicChiSquaredGradient()])
+    else:
+        return Pipeline([RimeBSqrt(),
+            RimeEBeam(),
+            RimeEKBSqrt(),
+            RimeSumCoherencies()])
 
 class RimeSolver(MontblancCUDASolver):
     """ RIME Solver Implementation """
diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py
index 95f13a970..158a93fe0 100644
--- a/montblanc/impl/rime/v4/config.py
+++ b/montblanc/impl/rime/v4/config.py
@@ -235,4 +235,6 @@ class Classifier(Enum):
         classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])),
     ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
+    ary_dict('X2_grad', (3,'nssrc',), 'ft',
+        classifiers=frozenset([Classifier.GPU_SCRATCH])),
 ]

From ddc2f0a1a309de48405277ca70c0d75827ea756a Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Fri, 15 Jul 2016 14:12:35 +0100
Subject: [PATCH 04/33] added chi squared gradient option

---
 montblanc/impl/rime/slvr_config.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/montblanc/impl/rime/slvr_config.py b/montblanc/impl/rime/slvr_config.py
index d5bae57b7..556534ed4 100644
--- a/montblanc/impl/rime/slvr_config.py
+++ b/montblanc/impl/rime/slvr_config.py
@@ -63,6 +63,14 @@ class RimeSolverConfig(SolverConfig):
         "If True, chi-squared terms are weighted with a vectorised sigma. "
         "If False, chi-squared terms are weighted with a single scalar sigma.")
 
+    # Enabling chi squared gradient computation with respect Sersic parameters
+    SERSIC_GRADIENT = 'sersic_gradient'
+    DEFAULT_SERSIC_GRADIENT = False
+    VALID_SERSIC_GRADIENT = [True, False]
+    SERSIC_GRADIENT_DESCRIPTION = (
+        "If True, chi_squared gradient is computed with respect to the sersic parameters for each source.")
+
+
     # weight vector initialisation keyword and valid values
     # This SolverConfig determines whether
     INIT_WEIGHTS = 'init_weights'
@@ -198,6 +206,13 @@ class RimeSolverConfig(SolverConfig):
             SolverConfig.DEFAULT: DEFAULT_E_BEAM_DEPTH,
             SolverConfig.REQUIRED: True
         },
+
+        SERSIC_GRADIENT: {
+            SolverConfig.DESCRIPTION: SERSIC_GRADIENT_DESCRIPTION,
+            SolverConfig.VALID: VALID_SERSIC_GRADIENT,
+            SolverConfig.DEFAULT: DEFAULT_SERSIC_GRADIENT,
+            SolverConfig.REQUIRED: True
+        },
     }
 
     def parser(self):
@@ -221,6 +236,12 @@ def parser(self):
             help=self.E_BEAM_DEPTH_DESCRIPTION,
             default=self.DEFAULT_E_BEAM_DEPTH)
 
+        p.add_argument('--{v}'.format(v=self.SERSIC_GRADIENT),
+            required=False,
+            type=bool,
+            help=self.SERSIC_GRADIENT_DESCRIPTION,
+            default=self.DEFAULT_SERSIC_GRADIENT)
+
         p.add_argument('--{v}'.format(v=self.WEIGHT_VECTOR),
             required=False,
             type=bool,
@@ -274,4 +295,4 @@ def parser(self):
             help=self.VERSION_DESCRIPTION,
             default=self.DEFAULT_VERSION)
 
-        return p
\ No newline at end of file
+        return p

From 21dc3af4d8a6a42dec1a44a2ddcbed1a14afe02a Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Mon, 18 Jul 2016 18:05:40 +0100
Subject: [PATCH 05/33] Sersic gradient example

---
 montblanc/examples/MS_gradient_example.py | 140 ++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 montblanc/examples/MS_gradient_example.py

diff --git a/montblanc/examples/MS_gradient_example.py b/montblanc/examples/MS_gradient_example.py
new file mode 100644
index 000000000..c1f51f8dc
--- /dev/null
+++ b/montblanc/examples/MS_gradient_example.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# Compare analytical chi squared gradient computation with respect to 
+# Sersic parameters with the numerical gradent computation. 
+# CPU and GPU solvers are also compared
+ 
+# Input: filename - Measurement Set file
+#        ns - number of sersic sources
+
+import sys
+import argparse
+import math
+import numpy as np
+import montblanc
+import montblanc.util as mbu
+
+from  montblanc.config import RimeSolverConfig as Options
+
+ARCS2RAD = np.pi/648000.
+
+#scalelength in arcsec
+minscale = 0.3*ARCS2RAD
+maxscale = 3.5*ARCS2RAD
+
+parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT')
+parser.add_argument('msfile', help='Input MS filename')
+parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies')
+
+args = parser.parse_args(sys.argv[1:])
+
+# Get the RIME solver
+# Enable gradient computation: sersic_gradient=True
+slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile,
+        sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc),
+        init_weights=None, weight_vector=False,  
+        sersic_gradient=True, dtype='double', version='v4')
+
+with montblanc.rime_solver(slvr_cfg) as slvr:
+
+    nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan')
+    
+# Random source coordinates in the l,m (brightness image) domain
+    l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD)
+    m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) 
+    lm=mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype)
+    slvr.transfer_lm(lm)
+
+# Brightness matrix for sources
+    stokes = np.empty(shape=slvr.stokes.shape, dtype=slvr.stokes.dtype)
+    I, Q, U, V = stokes[:,:,0], stokes[:,:,1], stokes[:,:,2], stokes[:,:,3]
+    I[:] = np.ones(shape=I.shape)*70.
+    Q[:] = np.zeros(shape=Q.shape)
+    U[:] = np.zeros(shape=U.shape)
+    V[:] = np.zeros(shape=V.shape)
+    slvr.transfer_stokes(stokes)
+    alpha = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime)
+    slvr.transfer_alpha(alpha)
+    
+    # If there are sersic sources, create their
+    # shape matrix and transfer it.
+    mod = slvr.ft(np.random.random(nssrc))*0.3
+    angle = slvr.ft(np.random.random(nssrc))*2*np.pi
+    e1 = mod*np.sin(angle)
+    e2 = mod*np.cos(angle)
+    R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale
+    sersic_shape = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc))
+    slvr.transfer_sersic_shape(sersic_shape)
+    print sersic_shape
+
+    #Set visibility noise variance (muJy)
+    time_acc = 60
+    efficiency = 0.9
+    channel_bandwidth_hz = 20e6
+    SEFD = 400e6
+    sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency)
+    slvr.set_sigma_sqrd(sigma)
+
+
+    # Create observed data and upload it to the GPU
+    slvr.solve()
+    with slvr.context:
+        observed = slvr.retrieve_model_vis()
+
+    noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noise = noiseR +1j*noiseI
+    observed = observed+noise
+    print 'transfer to GPU'
+    slvr.transfer_observed_vis(observed)
+ 
+    print slvr
+
+    slvr.solve() 
+    dq_gpu = slvr.X2_grad  
+    print "analytical GPU: ",dq_gpu
+
+    # Execute the pipeline
+    slvr.solve()
+    l1 = slvr.X2
+    dq = np.empty([3,nssrc])
+    for i in xrange(nssrc):
+        e1_inc = sersic_shape.copy()
+        e1_inc[0,i] = e1_inc[0,i]+0.000001
+        slvr.transfer_sersic_shape(e1_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[0,i] = (l2-l1)*1e6
+        e2_inc = sersic_shape.copy()
+        e2_inc[1,i] = e2_inc[1,i]+ 0.000001
+        slvr.transfer_sersic_shape(e2_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[1,i] = (l2-l1)*1e6
+        R_inc = sersic_shape.copy()
+        R_inc[2,i] = R_inc[2,i]+0.00000001
+        slvr.transfer_sersic_shape(R_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[2,i] = (l2-l1)*1e8
+    print "numeric",dq
+
+

From 05433fdb66d980520b42879b08f9b6450d6600c4 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Mon, 18 Jul 2016 18:06:42 +0100
Subject: [PATCH 06/33] Sersic gradient example

---
 montblanc/examples/MS_gradient_example.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/montblanc/examples/MS_gradient_example.py b/montblanc/examples/MS_gradient_example.py
index c1f51f8dc..041405521 100644
--- a/montblanc/examples/MS_gradient_example.py
+++ b/montblanc/examples/MS_gradient_example.py
@@ -20,7 +20,6 @@
 
 # Compare analytical chi squared gradient computation with respect to 
 # Sersic parameters with the numerical gradent computation. 
-# CPU and GPU solvers are also compared
  
 # Input: filename - Measurement Set file
 #        ns - number of sersic sources

From 7541c011557a6b998963fac3b962aba228151430 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Wed, 20 Jul 2016 16:56:39 +0100
Subject: [PATCH 07/33] uncommented multiplication by the G term

---
 montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 8e6bafa40..b017cb502 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -277,7 +277,7 @@ class SumCohTraits<double> {
         dev_vis[1].y = ant_one.y*dev_e2;
         dev_vis[2].x = ant_one.x*dev_scale;
         dev_vis[2].y = ant_one.y*dev_scale;
-  /* 
+   
         for (int p = 0; p < 3; p++)
         {
           // Multiply the visibility derivative by antenna 1's g term
@@ -291,7 +291,7 @@ class SumCohTraits<double> {
           montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant1_g_term, ant2_g_term);    
           dev_vis[p] = ant1_g_term;
         }
-   */
+   
     
         // Write partial derivative with respect to sersic parameters
         for (int p=0; p<3; p++)

From b59b4e0af2abf44fb2ad8144b553cb8c53795d80 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <marzia.rivi@gmail.com>
Date: Thu, 28 Jul 2016 16:52:10 +0100
Subject: [PATCH 08/33] added minor optimisations

---
 .../rime/v4/gpu/SersicChiSquaredGradient.py   | 39 +++++++++----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index b017cb502..1e927e19c 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -222,11 +222,9 @@ class SumCohTraits<double> {
             i = SRC - SRC_START;                 shared.e1 = sersic_shape[i];
             i += DEXT(nssrc);                    shared.e2 = sersic_shape[i];
             i += DEXT(nssrc);                    shared.sersic_scale = sersic_shape[i];
-
-            shared.X2_grad_part[0] = 0.;
-            shared.X2_grad_part[1] = 0.;
-            shared.X2_grad_part[2] = 0.;
         }
+        if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3)
+            shared.X2_grad_part[threadIdx.z] = 0.;
 
         __syncthreads();
 
@@ -277,7 +275,7 @@ class SumCohTraits<double> {
         dev_vis[1].y = ant_one.y*dev_e2;
         dev_vis[2].x = ant_one.x*dev_scale;
         dev_vis[2].y = ant_one.y*dev_scale;
-   
+   /*
         for (int p = 0; p < 3; p++)
         {
           // Multiply the visibility derivative by antenna 1's g term
@@ -292,7 +290,7 @@ class SumCohTraits<double> {
           dev_vis[p] = ant1_g_term;
         }
    
-    
+    */
         // Write partial derivative with respect to sersic parameters
         for (int p=0; p<3; p++)
         {
@@ -323,14 +321,11 @@ class SumCohTraits<double> {
         __syncthreads();
         
         //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
-        if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+        // 3 different threads writes each a different component to avoid serialisation
+        if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3)
         {
-            i = SRC - DEXT(npsrc) - DEXT(ngsrc);
-            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[0]);
-            i += DEXT(nssrc);
-            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[1]);
-            i += DEXT(nssrc);
-            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[2]);
+            i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.z*DEXT(nssrc);
+            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.z]);
         }
     }
 }
@@ -378,7 +373,7 @@ def __init__(self, weight_vector=False):
 
     def initialise(self, solver, stream=None):
         slvr = solver
-        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
+        nssrc, ntime, nbl, npolchan = slvr.dim_local_size('nssrc','ntime', 'nbl', 'npolchan')
 
           # Get a property dictionary off the solver
         D = slvr.template_dict()
@@ -410,6 +405,11 @@ def initialise(self, solver, stream=None):
             include_dirs=[montblanc.get_source_path()],
             no_extern_c=True)
 
+        if slvr.is_float():
+            self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32)
+        else:
+            self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64)
+
         self.rime_const_data_gpu = self.mod.get_global('C')
         self.kernel = self.mod.get_function(kname)
         self.launch_params = self.get_launch_params(slvr, D)
@@ -451,20 +451,17 @@ def execute(self, solver, stream=None):
 
         sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
             else slvr.sersic_shape
-       
-        if slvr.is_float(): 
-            gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32)
-        else:
-            gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64)
+
+        self.gradient.fill(0.) 
 
         self.kernel(slvr.uvw, sersic,
             slvr.frequency, slvr.antenna1, slvr.antenna2,
             slvr.jones, slvr.flag, slvr.weight_vector,
             slvr.observed_vis, slvr.G_term,
-            slvr.model_vis, gradient,
+            slvr.model_vis, self.gradient,
             stream=stream, **self.launch_params)
 
-        slvr.X2_grad = 6*gradient.get().reshape(3,nssrc)
+        slvr.X2_grad = 6*((self.gradient).get().reshape(3,nssrc))
 
         if not self.weight_vector:
             slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd

From 6ad91b90c0b1704d19a071a29bb3f1cd01abd3b6 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Mon, 1 Aug 2016 11:56:39 +0100
Subject: [PATCH 09/33] move gradient elements product by 6 in the kernel

---
 montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 1e927e19c..a4bb3b007 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -315,6 +315,7 @@ class SumCohTraits<double> {
 
             //atomic addition to avoid concurrent access in the shared memory
             dev_vis[p].x += dev_vis[p].y;
+            dev_vis[p].x *= 6;
             atomicAdd(&(shared.X2_grad_part[p]), dev_vis[p].x);
           }
         }
@@ -322,10 +323,10 @@ class SumCohTraits<double> {
         
         //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
         // 3 different threads writes each a different component to avoid serialisation
-        if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3)
+        if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
         {
-            i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.z*DEXT(nssrc);
-            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.z]);
+            i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.x*DEXT(nssrc);
+            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]);
         }
     }
 }
@@ -461,7 +462,7 @@ def execute(self, solver, stream=None):
             slvr.model_vis, self.gradient,
             stream=stream, **self.launch_params)
 
-        slvr.X2_grad = 6*((self.gradient).get().reshape(3,nssrc))
+        slvr.X2_grad = (self.gradient).get().reshape(3,nssrc)
 
         if not self.weight_vector:
             slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd

From d083bd57d53ca550342d7ac6003b32986ff9f817 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Mon, 1 Aug 2016 14:47:50 +0100
Subject: [PATCH 10/33] changed X2_grad classifier, now SIMULATOR_OUTPUT

---
 montblanc/impl/rime/v4/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py
index 158a93fe0..edf14a349 100644
--- a/montblanc/impl/rime/v4/config.py
+++ b/montblanc/impl/rime/v4/config.py
@@ -236,5 +236,5 @@ class Classifier(Enum):
     ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
     ary_dict('X2_grad', (3,'nssrc',), 'ft',
-        classifiers=frozenset([Classifier.GPU_SCRATCH])),
+        classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])),
 ]

From 32734c20ddb47c13ab22e409669d1660f4c392a8 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Mon, 1 Aug 2016 15:46:39 +0100
Subject: [PATCH 11/33] minor changes

---
 montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index a4bb3b007..be4ae2fd2 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -407,9 +407,9 @@ def initialise(self, solver, stream=None):
             no_extern_c=True)
 
         if slvr.is_float():
-            self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float32)
+            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float32)
         else:
-            self.gradient = gpuarray.zeros((3*nssrc,),dtype=np.float64)
+            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64)
 
         self.rime_const_data_gpu = self.mod.get_global('C')
         self.kernel = self.mod.get_function(kname)
@@ -438,7 +438,6 @@ def get_launch_params(self, slvr, D):
 
     def execute(self, solver, stream=None):
         slvr = solver
-        nssrc = slvr.dim_local_size('nssrc')
 
         if stream is not None:
             cuda.memcpy_htod_async(
@@ -462,7 +461,7 @@ def execute(self, solver, stream=None):
             slvr.model_vis, self.gradient,
             stream=stream, **self.launch_params)
 
-        slvr.X2_grad = (self.gradient).get().reshape(3,nssrc)
+        slvr.X2_grad = self.gradient.get() 
 
         if not self.weight_vector:
             slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd

From 30f1001f85331b6f6b366cd72a7add373b8954d4 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Tue, 2 Aug 2016 12:00:03 +0100
Subject: [PATCH 12/33] minor changes

---
 .../MS_single_gpu_gradient_example.py         | 134 ++++++++++++++++++
 .../rime/v4/gpu/SersicChiSquaredGradient.py   |   6 +-
 2 files changed, 137 insertions(+), 3 deletions(-)
 create mode 100644 montblanc/examples/MS_single_gpu_gradient_example.py

diff --git a/montblanc/examples/MS_single_gpu_gradient_example.py b/montblanc/examples/MS_single_gpu_gradient_example.py
new file mode 100644
index 000000000..fe3a61e4d
--- /dev/null
+++ b/montblanc/examples/MS_single_gpu_gradient_example.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# Compare analytical chi squared gradient computation with respect to 
+# Sersic parameters with the numerical gradent computation. 
+ 
+# Input: filename - Measurement Set file
+#        ns - number of sersic sources
+
+import sys
+import argparse
+import math
+import numpy as np
+import montblanc
+import montblanc.util as mbu
+
+from  montblanc.config import RimeSolverConfig as Options
+
+ARCS2RAD = np.pi/648000.
+
+#scalelength in arcsec
+minscale = 0.3*ARCS2RAD
+maxscale = 3.5*ARCS2RAD
+
+parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT')
+parser.add_argument('msfile', help='Input MS filename')
+parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies')
+
+args = parser.parse_args(sys.argv[1:])
+
+# Get the RIME solver
+# Enable gradient computation: sersic_gradient=True
+slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile,
+        sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc),
+        init_weights=None, weight_vector=False,  
+        sersic_gradient=True, dtype='double', version='v4')
+
+with montblanc.rime_solver(slvr_cfg) as slvr:
+
+    nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan')
+    np.random.seed(14352)
+    
+    # Random source coordinates in the l,m (brightness image) domain
+    l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD)
+    m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) 
+    lm=mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype)
+    slvr.transfer_lm(lm)
+
+    # Brightness matrix for sources
+    stokes = np.empty(shape=slvr.stokes.shape, dtype=slvr.stokes.dtype)
+    I, Q, U, V = stokes[:,:,0], stokes[:,:,1], stokes[:,:,2], stokes[:,:,3]
+    I[:] = np.ones(shape=I.shape)*70.
+    Q[:] = np.zeros(shape=Q.shape)
+    U[:] = np.zeros(shape=U.shape)
+    V[:] = np.zeros(shape=V.shape)
+    slvr.transfer_stokes(stokes)
+    alpha = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime)
+    slvr.transfer_alpha(alpha)
+    
+    # If there are sersic sources, create their
+    # shape matrix and transfer it.
+    mod = slvr.ft(np.random.random(nssrc))*0.3
+    angle = slvr.ft(np.random.random(nssrc))*2*np.pi
+    e1 = mod*np.sin(angle)
+    e2 = mod*np.cos(angle)
+    R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale
+    sersic_shape = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc))
+    slvr.transfer_sersic_shape(sersic_shape)
+    print sersic_shape
+
+    #Set visibility noise variance (muJy)
+    time_acc = 60
+    efficiency = 0.9
+    channel_bandwidth_hz = 20e6
+    SEFD = 400e6
+    sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency)
+    slvr.set_sigma_sqrd(sigma)
+
+    # Create observed data and upload it to the GPU
+    slvr.solve()
+    with slvr.context:
+        observed = slvr.retrieve_model_vis()
+
+    noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noise = noiseR +1j*noiseI
+    observed = observed+noise
+    slvr.transfer_observed_vis(observed)
+ 
+    slvr.solve() 
+
+    print slvr
+    print "analytical: ",slvr.X2_grad
+
+    # Execute the pipeline
+    l1 = slvr.X2
+    dq = np.empty([3,nssrc])
+    for i in xrange(nssrc):
+        e1_inc = sersic_shape.copy()
+        e1_inc[0,i] = e1_inc[0,i]+0.000001
+        slvr.transfer_sersic_shape(e1_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[0,i] = (l2-l1)*1e6
+        e2_inc = sersic_shape.copy()
+        e2_inc[1,i] = e2_inc[1,i]+ 0.000001
+        slvr.transfer_sersic_shape(e2_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[1,i] = (l2-l1)*1e6
+        R_inc = sersic_shape.copy()
+        R_inc[2,i] = R_inc[2,i]+0.00000001
+        slvr.transfer_sersic_shape(R_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[2,i] = (l2-l1)*1e8
+    print "numeric: ",dq
diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index be4ae2fd2..33e9061be 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -411,7 +411,7 @@ def initialise(self, solver, stream=None):
         else:
             self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64)
 
-        self.rime_const_data_gpu = self.mod.get_global('C')
+        self.rime_const_data = self.mod.get_global('C')
         self.kernel = self.mod.get_function(kname)
         self.launch_params = self.get_launch_params(slvr, D)
 
@@ -441,12 +441,12 @@ def execute(self, solver, stream=None):
 
         if stream is not None:
             cuda.memcpy_htod_async(
-                self.rime_const_data_gpu[0],
+                self.rime_const_data[0],
                 slvr.const_data().ndary(),
                 stream=stream)
         else:
             cuda.memcpy_htod(
-                self.rime_const_data_gpu[0],
+                self.rime_const_data[0],
                 slvr.const_data().ndary())
 
         sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \

From e0d046700614badded753bbb9f19a082e4f8f004 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Tue, 2 Aug 2016 14:46:16 +0100
Subject: [PATCH 13/33] fixed bug #152 as in the master merge #156

---
 montblanc/examples/MS_gradient_example.py     | 139 ------------------
 montblanc/impl/rime/v5/CompositeRimeSolver.py |  96 ++++++++++--
 2 files changed, 80 insertions(+), 155 deletions(-)
 delete mode 100644 montblanc/examples/MS_gradient_example.py

diff --git a/montblanc/examples/MS_gradient_example.py b/montblanc/examples/MS_gradient_example.py
deleted file mode 100644
index 041405521..000000000
--- a/montblanc/examples/MS_gradient_example.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright (c) 2016 Marzia Rivi
-#
-# This file is part of montblanc.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
-
-# Compare analytical chi squared gradient computation with respect to 
-# Sersic parameters with the numerical gradent computation. 
- 
-# Input: filename - Measurement Set file
-#        ns - number of sersic sources
-
-import sys
-import argparse
-import math
-import numpy as np
-import montblanc
-import montblanc.util as mbu
-
-from  montblanc.config import RimeSolverConfig as Options
-
-ARCS2RAD = np.pi/648000.
-
-#scalelength in arcsec
-minscale = 0.3*ARCS2RAD
-maxscale = 3.5*ARCS2RAD
-
-parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT')
-parser.add_argument('msfile', help='Input MS filename')
-parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies')
-
-args = parser.parse_args(sys.argv[1:])
-
-# Get the RIME solver
-# Enable gradient computation: sersic_gradient=True
-slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile,
-        sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc),
-        init_weights=None, weight_vector=False,  
-        sersic_gradient=True, dtype='double', version='v4')
-
-with montblanc.rime_solver(slvr_cfg) as slvr:
-
-    nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan')
-    
-# Random source coordinates in the l,m (brightness image) domain
-    l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD)
-    m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) 
-    lm=mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype)
-    slvr.transfer_lm(lm)
-
-# Brightness matrix for sources
-    stokes = np.empty(shape=slvr.stokes.shape, dtype=slvr.stokes.dtype)
-    I, Q, U, V = stokes[:,:,0], stokes[:,:,1], stokes[:,:,2], stokes[:,:,3]
-    I[:] = np.ones(shape=I.shape)*70.
-    Q[:] = np.zeros(shape=Q.shape)
-    U[:] = np.zeros(shape=U.shape)
-    V[:] = np.zeros(shape=V.shape)
-    slvr.transfer_stokes(stokes)
-    alpha = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime)
-    slvr.transfer_alpha(alpha)
-    
-    # If there are sersic sources, create their
-    # shape matrix and transfer it.
-    mod = slvr.ft(np.random.random(nssrc))*0.3
-    angle = slvr.ft(np.random.random(nssrc))*2*np.pi
-    e1 = mod*np.sin(angle)
-    e2 = mod*np.cos(angle)
-    R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale
-    sersic_shape = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc))
-    slvr.transfer_sersic_shape(sersic_shape)
-    print sersic_shape
-
-    #Set visibility noise variance (muJy)
-    time_acc = 60
-    efficiency = 0.9
-    channel_bandwidth_hz = 20e6
-    SEFD = 400e6
-    sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency)
-    slvr.set_sigma_sqrd(sigma)
-
-
-    # Create observed data and upload it to the GPU
-    slvr.solve()
-    with slvr.context:
-        observed = slvr.retrieve_model_vis()
-
-    noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
-    noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
-    noise = noiseR +1j*noiseI
-    observed = observed+noise
-    print 'transfer to GPU'
-    slvr.transfer_observed_vis(observed)
- 
-    print slvr
-
-    slvr.solve() 
-    dq_gpu = slvr.X2_grad  
-    print "analytical GPU: ",dq_gpu
-
-    # Execute the pipeline
-    slvr.solve()
-    l1 = slvr.X2
-    dq = np.empty([3,nssrc])
-    for i in xrange(nssrc):
-        e1_inc = sersic_shape.copy()
-        e1_inc[0,i] = e1_inc[0,i]+0.000001
-        slvr.transfer_sersic_shape(e1_inc)
-        slvr.solve()
-        l2 = slvr.X2
-        dq[0,i] = (l2-l1)*1e6
-        e2_inc = sersic_shape.copy()
-        e2_inc[1,i] = e2_inc[1,i]+ 0.000001
-        slvr.transfer_sersic_shape(e2_inc)
-        slvr.solve()
-        l2 = slvr.X2
-        dq[1,i] = (l2-l1)*1e6
-        R_inc = sersic_shape.copy()
-        R_inc[2,i] = R_inc[2,i]+0.00000001
-        slvr.transfer_sersic_shape(R_inc)
-        slvr.solve()
-        l2 = slvr.X2
-        dq[2,i] = (l2-l1)*1e8
-    print "numeric",dq
-
-
diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index 3ff158978..f859d5c8c 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -126,7 +126,9 @@ def __init__(self, slvr_cfg):
         # before throttling is applied
         self.throttle_factor = slvr_cfg.get(
             Options.VISIBILITY_THROTTLE_FACTOR)
-
+        # Enabling chi squared gradient computation
+        self.enable_sersic_grad = slvr_cfg.get(Options.SERSIC_GRADIENT)
+     
         # Massage the contexts for each device into a list
         if not isinstance(self.dev_ctxs, list):
             self.dev_ctxs = [self.dev_ctxs]
@@ -926,6 +928,31 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                     subslvr, kernel.rime_const_data[0]))
                 kernel.execute(subslvr, subslvr.stream)
 
+            # Iterate again for gradient computation requiring visibilities computed
+            # for all source chuncks
+            if self.enable_sersic_grad:
+                for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices():
+                    # Update our maps with source slice information
+                    cpu_slice_map.update(src_cpu_slice_map)
+                    gpu_slice_map.update(src_gpu_slice_map)
+
+                    # Configure dimension extents and global size on the sub-solver
+                    for name, slice_ in cpu_slice_map.iteritems():
+                        subslvr.update_dimension(name=name,
+                            global_size=self.dimension(name).global_size,
+                            lower_extent=slice_.start,
+                            upper_extent=slice_.stop)
+    
+                    # Enqueue Sersic gradient
+                    kernel = subslvr.rime_sersic_gradient
+                    pool_refs.extend(self._enqueue_array(
+                        subslvr, cpu_slice_map, gpu_slice_map,
+                        direction=ASYNC_HTOD, dirty=dirty,
+                        classifiers=[Classifier.COHERENCIES_INPUT]))
+                    pool_refs.append(self._enqueue_const_data_htod(
+                        subslvr, kernel.rime_const_data[0]))
+                    kernel.execute(subslvr, subslvr.stream)
+
             # Enqueue chi-squared term reduction and return the
             # GPU array allocated to it
             X2_gpu_ary = subslvr.rime_reduce.execute(subslvr, subslvr.stream)
@@ -943,8 +970,13 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                 classifiers=[Classifier.SIMULATOR_OUTPUT])
 
         # Should only be model visibilities
-        assert len(refs) == 1, ('Expected one array (model visibilities), '
-            'received {l} instead.'.format(l=len(refs)))
+        if self.enable_sersic_grad:
+            assert len(refs) == 2, ('Expected two arrays (model visibilities, X2_grad), '
+                'received {l} instead.'.format(l=len(refs)))
+            X2_grad = refs[1]
+        else:
+            assert len(refs) == 1, ('Expected one array (model visibilities), '
+                'received {l} instead.'.format(l=len(refs)))
 
         model_vis = refs[0]
 
@@ -959,11 +991,17 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         pool_refs.append(X2_gpu_ary)
         pool_refs.append(sub_X2)
         pool_refs.append(model_vis)
-
-        return (sync_event, sub_X2, model_vis,
-            pool_refs, subslvr.pool_lock,
-            cpu_slice_map.copy(),
-            gpu_slice_map.copy())
+        if self.enable_sersic_grad:
+            pool_refs.append(X2_grad)
+            return (sync_event, sub_X2, model_vis, X2_grad,
+                pool_refs, subslvr.pool_lock,
+                cpu_slice_map.copy(),
+                gpu_slice_map.copy())
+        else:
+            return (sync_event, sub_X2, model_vis,
+                pool_refs, subslvr.pool_lock,
+                cpu_slice_map.copy(),
+                gpu_slice_map.copy())
 
     def initialise(self):
         """ Initialise the sub-solver """
@@ -1009,8 +1047,12 @@ def _sync_wait(future):
             Return a copy of the pinned chi-squared, pinned model visibilities
             and index into the CPU array after synchronizing on the cuda_event
             """
-            cuda_event, pinned_X2, pinned_model_vis, \
-                pool_refs, pool_lock, cpu, gpu = future.result()
+            if self.enable_sersic_grad:
+                cuda_event, pinned_X2, pinned_model_vis, pinned_X2_grad,\
+                    pool_refs, pool_lock, cpu, gpu = future.result()
+            else:
+                cuda_event, pinned_X2, pinned_model_vis, \
+                    pool_refs, pool_lock, cpu, gpu = future.result()
 
             try:
                 cuda_event.synchronize()
@@ -1031,12 +1073,22 @@ def _sync_wait(future):
             model_vis_idx = [cpu[s] if s in cpu else ALL_SLICE
                 for s in model_vis_sshape]
 
+            # Infer proper model visibility shape, this may
+            # have been squeezed when enqueueing the slice
+            model_vis_shape = tuple(cpu[s].stop - cpu[s].start
+                if s in cpu else s for s in model_vis_sshape)
+
             X2 = pinned_X2.copy()
-            model_vis = pinned_model_vis.copy()
+            model_vis = pinned_model_vis.copy().reshape(model_vis_shape)
+            if self.enable_sersic_grad:
+                X2_grad = pinned_X2_grad.copy()
 
             _free_pool_allocs(pool_refs, pool_lock)
 
-            return X2, model_vis, model_vis_idx
+            if self.enable_sersic_grad:
+                return X2, model_vis, model_vis_idx, X2_grad
+            else:
+                return X2, model_vis, model_vis_idx
 
         # For easier typing
         C = CompositeRimeSolver
@@ -1048,6 +1100,9 @@ def _sync_wait(future):
 
         # Running sum of the chi-squared values returned in futures
         X2_sum = self.ft(0.0)
+        if self.enable_sersic_grad:
+            nssrc = self.dim_local_size('nssrc')
+            self.X2_grad = self.ft(np.zeros((3,nssrc)))
 
         # Sets of return value futures for each executor
         value_futures = [set() for ex in self.enqueue_executors]
@@ -1118,8 +1173,12 @@ def _accumulate(model_slice, model_chunk):
                         for s in value_futures:
                             s.discard(f)
 
-                        # Get chi-squared and model visibilities
-                        X2, pinned_model_vis, model_vis_idx = f.result()
+                        # Get chi-squared and model visibilities (and X2 gradient)
+                        if self.enable_sersic_grad:
+                            X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
+                            self.X2_grad += pinned_X2_grad
+                        else:
+                            X2, pinned_model_vis, model_vis_idx = f.result()
                         # Sum X2
                         X2_sum += X2
                         # Write model visibilities to the numpy array
@@ -1137,8 +1196,12 @@ def _accumulate(model_slice, model_chunk):
 
         # Sum remaining chi-squared
         for f in done:
-            # Get chi-squared and model visibilities
-            X2, pinned_model_vis, model_vis_idx = f.result()
+            # Get chi-squared and model visibilities (and X2 gradient)
+            if self.enable_sersic_grad:
+                X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
+                self.X2_grad += pinned_X2_grad
+            else:
+                X2, pinned_model_vis, model_vis_idx = f.result()
             # Sum X2
             X2_sum += X2
             # Write model visibilities to the numpy array
@@ -1150,6 +1213,7 @@ def _accumulate(model_slice, model_chunk):
             self.set_X2(X2_sum)
         else:
             self.set_X2(X2_sum/self.sigma_sqrd)
+            self.X2_grad = self.X2_grad/self.sigma_sqrd
 
     def shutdown(self):
         """ Shutdown the solver """

From 0761ef53e433c276cc674da30cdd1d16de884601 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 3 Aug 2016 16:55:59 +0100
Subject: [PATCH 14/33] created multi-gpu gradient example

---
 .../MS_single_node_gradient_example.py        | 130 ++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 montblanc/examples/MS_single_node_gradient_example.py

diff --git a/montblanc/examples/MS_single_node_gradient_example.py b/montblanc/examples/MS_single_node_gradient_example.py
new file mode 100644
index 000000000..41637fa76
--- /dev/null
+++ b/montblanc/examples/MS_single_node_gradient_example.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# Compare analytical chi squared gradient computation with respect to 
+# Sersic parameters with the numerical gradent computation. 
+ 
+# Input: filename - Measurement Set file
+#        ns - number of sersic sources
+
+import sys
+import argparse
+import math
+import numpy as np
+import montblanc
+import montblanc.util as mbu
+
+from  montblanc.config import RimeSolverConfig as Options
+
+ARCS2RAD = np.pi/648000.
+
+#scalelength in arcsec
+minscale = 0.3*ARCS2RAD
+maxscale = 3.5*ARCS2RAD
+
+parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT')
+parser.add_argument('msfile', help='Input MS filename')
+parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies')
+
+args = parser.parse_args(sys.argv[1:])
+
+# Get the RIME solver
+# Enable gradient computation: sersic_gradient=True
+slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile,
+        sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc),
+        init_weights=None, weight_vector=False,  
+        sersic_gradient=True, dtype='double', version='v5')
+
+with montblanc.rime_solver(slvr_cfg) as slvr:
+
+    nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan')
+    np.random.seed(14352)    
+
+    # Random source coordinates in the l,m (brightness image) domain
+    l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD)
+    m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) 
+    slvr.lm[:] = mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype)
+
+    # Brightness matrix for sources
+    I, Q, U, V = slvr.stokes[:,:,0], slvr.stokes[:,:,1], slvr.stokes[:,:,2], slvr.stokes[:,:,3]
+    I[:] = np.ones(shape=I.shape)*70.
+    Q[:] = np.zeros(shape=Q.shape)
+    U[:] = np.zeros(shape=U.shape)
+    V[:] = np.zeros(shape=V.shape)
+    slvr.alpha[:] = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime)
+    
+    # If there are sersic sources, create their
+    # shape matrix and transfer it.
+    mod = slvr.ft(np.random.random(nssrc))*0.3
+    angle = slvr.ft(np.random.random(nssrc))*2*np.pi
+    e1 = mod*np.sin(angle)
+    e2 = mod*np.cos(angle)
+    R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale
+    slvr.sersic_shape[:] = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc))
+    print slvr.sersic_shape
+
+    #Set visibility noise variance (muJy)
+    time_acc = 60
+    efficiency = 0.9
+    channel_bandwidth_hz = 20e6
+    SEFD = 400e6
+    sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency)
+    slvr.set_sigma_sqrd(sigma)
+
+    # Create observed data and upload it to the GPU
+    slvr.solve()
+    observed = slvr.model_vis[:].copy() 
+
+    noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noise = noiseR +1j*noiseI
+    slvr.observed_vis[:] = (observed+noise).copy()
+ 
+    slvr.solve()
+
+    print slvr 
+    print "analytical: ",slvr.X2_grad
+
+    # Execute the pipeline
+    l1 = slvr.X2
+    sersic_shape = slvr.sersic_shape.copy()
+    dq = np.empty([3,nssrc])
+    for i in xrange(nssrc):
+        e1_inc = sersic_shape.copy()
+        e1_inc[0,i] = e1_inc[0,i]+0.000001
+        slvr.sersic_shape[:] = e1_inc
+        slvr.solve()
+        l2 = slvr.X2
+        dq[0,i] = (l2-l1)*1e6
+        e2_inc = sersic_shape.copy()
+        e2_inc[1,i] = e2_inc[1,i]+ 0.000001
+        slvr.sersic_shape[:] = e2_inc
+        slvr.solve()
+        l2 = slvr.X2
+        dq[1,i] = (l2-l1)*1e6
+        R_inc = sersic_shape.copy()
+        R_inc[2,i] = R_inc[2,i]+0.00000001
+        slvr.sersic_shape[:] = R_inc
+        slvr.solve()
+        l2 = slvr.X2
+        dq[2,i] = (l2-l1)*1e8
+    print "numeric: ",dq
+
+

From 0e4e4a15356f4cdd79a84ab3e1da15e9f0ab1142 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 3 Aug 2016 17:21:02 +0100
Subject: [PATCH 15/33] add X2 gradient computation within multi-gpu solver

---
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 29 ++++++++++---------
 montblanc/impl/rime/v5/RimeSolver.py          | 12 +++++++-
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index f859d5c8c..46d99fcce 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -943,15 +943,15 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                             lower_extent=slice_.start,
                             upper_extent=slice_.stop)
     
-                    # Enqueue Sersic gradient
-                    kernel = subslvr.rime_sersic_gradient
-                    pool_refs.extend(self._enqueue_array(
-                        subslvr, cpu_slice_map, gpu_slice_map,
-                        direction=ASYNC_HTOD, dirty=dirty,
-                        classifiers=[Classifier.COHERENCIES_INPUT]))
-                    pool_refs.append(self._enqueue_const_data_htod(
-                        subslvr, kernel.rime_const_data[0]))
-                    kernel.execute(subslvr, subslvr.stream)
+                # Enqueue Sersic gradient
+                kernel = subslvr.rime_sersic_gradient
+                pool_refs.extend(self._enqueue_array(
+                    subslvr, cpu_slice_map, gpu_slice_map,
+                    direction=ASYNC_HTOD, dirty=dirty,
+                    classifiers=[Classifier.COHERENCIES_INPUT]))
+                pool_refs.append(self._enqueue_const_data_htod(
+                    subslvr, kernel.rime_const_data[0]))
+                kernel.execute(subslvr, subslvr.stream)
 
             # Enqueue chi-squared term reduction and return the
             # GPU array allocated to it
@@ -1102,7 +1102,7 @@ def _sync_wait(future):
         X2_sum = self.ft(0.0)
         if self.enable_sersic_grad:
             nssrc = self.dim_local_size('nssrc')
-            self.X2_grad = self.ft(np.zeros((3,nssrc)))
+            X2_grad = self.ft(np.zeros((3,nssrc)))
 
         # Sets of return value futures for each executor
         value_futures = [set() for ex in self.enqueue_executors]
@@ -1141,6 +1141,9 @@ def _accumulate(model_slice, model_chunk):
                     if nvalues > self.throttle_factor:
                         continue
 
+                    if self.enable_sersic_grad:
+                        self.X2_grad = self.ft(np.zeros((3,nssrc)))
+
                     # Enqueue CUDA operations for solving
                     # this visibility chunk. 
                     enqueue_future = enq_ex.submit(
@@ -1176,7 +1179,7 @@ def _accumulate(model_slice, model_chunk):
                         # Get chi-squared and model visibilities (and X2 gradient)
                         if self.enable_sersic_grad:
                             X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
-                            self.X2_grad += pinned_X2_grad
+                            X2_grad += pinned_X2_grad
                         else:
                             X2, pinned_model_vis, model_vis_idx = f.result()
                         # Sum X2
@@ -1199,7 +1202,7 @@ def _accumulate(model_slice, model_chunk):
             # Get chi-squared and model visibilities (and X2 gradient)
             if self.enable_sersic_grad:
                 X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
-                self.X2_grad += pinned_X2_grad
+                X2_grad += pinned_X2_grad
             else:
                 X2, pinned_model_vis, model_vis_idx = f.result()
             # Sum X2
@@ -1213,7 +1216,7 @@ def _accumulate(model_slice, model_chunk):
             self.set_X2(X2_sum)
         else:
             self.set_X2(X2_sum/self.sigma_sqrd)
-            self.X2_grad = self.X2_grad/self.sigma_sqrd
+            self.X2_grad = X2_grad/self.sigma_sqrd
 
     def shutdown(self):
         """ Shutdown the solver """
diff --git a/montblanc/impl/rime/v5/RimeSolver.py b/montblanc/impl/rime/v5/RimeSolver.py
index 7d7192ee0..32b41d7cf 100644
--- a/montblanc/impl/rime/v5/RimeSolver.py
+++ b/montblanc/impl/rime/v5/RimeSolver.py
@@ -31,6 +31,7 @@
 from montblanc.impl.rime.v5.gpu.RimeEKBSqrt import RimeEKBSqrt
 from montblanc.impl.rime.v5.gpu.RimeSumCoherencies import RimeSumCoherencies
 from montblanc.impl.rime.v5.gpu.RimeReduction import RimeReduction
+from montblanc.impl.rime.v5.gpu.SersicChiSquaredGradient import SersicChiSquaredGradient
 
 from montblanc.impl.rime.v4.RimeSolver import RimeSolver as RimeSolverV4
 
@@ -64,11 +65,14 @@ def __init__(self, slvr_cfg):
             slvr_cfg[Options.E_BEAM_DEPTH],
             description='E cube nu depth')
 
+        self.enable_sersic_grad = slvr_cfg.get(Options.SERSIC_GRADIENT)
+
         self.rime_e_beam = RimeEBeam()
         self.rime_b_sqrt = RimeBSqrt()
         self.rime_ekb_sqrt = RimeEKBSqrt()
         self.rime_sum = RimeSumCoherencies()
         self.rime_reduce = RimeReduction()
+        self.rime_sersic_gradient = SersicChiSquaredGradient()
 
         from montblanc.impl.rime.v4.ant_pairs import monkey_patch_antenna_pairs
         monkey_patch_antenna_pairs(self)
@@ -136,6 +140,8 @@ def initialise(self):
             self.rime_ekb_sqrt.initialise(self)
             self.rime_sum.initialise(self)
             self.rime_reduce.initialise(self)
+            if self.enable_sersic_grad:
+                self.rime_sersic_gradient.initialise(self)
 
     def solve(self):
         with self.context:
@@ -144,6 +150,8 @@ def solve(self):
             self.rime_ekb_sqrt.execute(self)
             self.rime_sum.execute(self)
             self.rime_reduce.execute(self)
+            if self.enable_sersic_grad:
+                self.rime_sersic_gradient.execute(self)
 
     def shutdown(self):
         with self.context:
@@ -151,4 +159,6 @@ def shutdown(self):
             self.rime_b_sqrt.shutdown(self)
             self.rime_ekb_sqrt.shutdown(self)
             self.rime_sum.shutdown(self)
-            self.rime_reduce.shutdown(self)
\ No newline at end of file
+            self.rime_reduce.shutdown(self)
+            if self.enable_sersic_grad:
+                self.rime_sersic_gradient.shutdown(self)

From 6f7e2c0c7b15715572eb17942bc48d1513b7d847 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 3 Aug 2016 17:22:02 +0100
Subject: [PATCH 16/33] add X2 gradient computation version 5

---
 .../rime/v5/gpu/SersicChiSquaredGradient.py   | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py

diff --git a/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py
new file mode 100644
index 000000000..0d93b9062
--- /dev/null
+++ b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+
+import pycuda.driver as cuda
+import pycuda.gpuarray as gpuarray
+
+import montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient
+
+class SersicChiSquaredGradient(montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient.SersicChiSquaredGradient):
+    def __init__(self):
+        super(SersicChiSquaredGradient, self).__init__()
+    def initialise(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).initialise(solver,stream)
+    def shutdown(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).shutdown(solver,stream)
+    def pre_execution(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).pre_execution(solver,stream)
+
+        if stream is not None:
+            cuda.memcpy_htod_async(
+                self.rime_const_data[0],
+                solver.const_data().ndary(),
+                stream=stream)
+        else:
+            cuda.memcpy_htod(
+                self.rime_const_data[0],
+                solver.const_data().ndary())
+
+    def post_execution(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).post_execution(solver,stream)
+
+    def execute(self, solver, stream=None):
+        slvr = solver
+
+        sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
+            else slvr.sersic_shape
+
+        print slvr.X2_grad
+        self.kernel(slvr.uvw, sersic,
+            slvr.frequency, slvr.antenna1, slvr.antenna2,
+            slvr.jones, slvr.flag, slvr.weight_vector,
+            slvr.observed_vis, slvr.G_term,
+            slvr.model_vis, slvr.X2_grad,
+            stream=stream, **self.launch_params)
+        print slvr.X2_grad

From 11b85ea258fa2dce9e4fed1850b1dbfd0707cf66 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 3 Aug 2016 20:34:27 +0100
Subject: [PATCH 17/33] removed print instructions

---
 montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py
index 0d93b9062..70d5c0445 100644
--- a/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py
@@ -54,11 +54,9 @@ def execute(self, solver, stream=None):
         sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
             else slvr.sersic_shape
 
-        print slvr.X2_grad
         self.kernel(slvr.uvw, sersic,
             slvr.frequency, slvr.antenna1, slvr.antenna2,
             slvr.jones, slvr.flag, slvr.weight_vector,
             slvr.observed_vis, slvr.G_term,
             slvr.model_vis, slvr.X2_grad,
             stream=stream, **self.launch_params)
-        print slvr.X2_grad

From 7a959880c0f7e8e049427ec26544dbe85d466ffa Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 3 Aug 2016 20:43:59 +0100
Subject: [PATCH 18/33] fixed misalignment

---
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index 46d99fcce..aac4ddcb4 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -935,7 +935,7 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                     # Update our maps with source slice information
                     cpu_slice_map.update(src_cpu_slice_map)
                     gpu_slice_map.update(src_gpu_slice_map)
-
+           
                     # Configure dimension extents and global size on the sub-solver
                     for name, slice_ in cpu_slice_map.iteritems():
                         subslvr.update_dimension(name=name,
@@ -943,15 +943,15 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                             lower_extent=slice_.start,
                             upper_extent=slice_.stop)
     
-                # Enqueue Sersic gradient
-                kernel = subslvr.rime_sersic_gradient
-                pool_refs.extend(self._enqueue_array(
-                    subslvr, cpu_slice_map, gpu_slice_map,
-                    direction=ASYNC_HTOD, dirty=dirty,
-                    classifiers=[Classifier.COHERENCIES_INPUT]))
-                pool_refs.append(self._enqueue_const_data_htod(
-                    subslvr, kernel.rime_const_data[0]))
-                kernel.execute(subslvr, subslvr.stream)
+                    # Enqueue Sersic gradient
+                    kernel = subslvr.rime_sersic_gradient
+                    pool_refs.extend(self._enqueue_array(
+                        subslvr, cpu_slice_map, gpu_slice_map,
+                        direction=ASYNC_HTOD, dirty=dirty,
+                        classifiers=[Classifier.COHERENCIES_INPUT]))
+                    pool_refs.append(self._enqueue_const_data_htod(
+                        subslvr, kernel.rime_const_data[0]))
+                    kernel.execute(subslvr, subslvr.stream)
 
             # Enqueue chi-squared term reduction and return the
             # GPU array allocated to it

From fc486a345892e5327a8d18b58c933685e7ff3ee1 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 17 Aug 2016 14:50:35 +0100
Subject: [PATCH 19/33] bug fixed

---
 montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 33e9061be..2ed9cb437 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -223,8 +223,8 @@ class SumCohTraits<double> {
             i += DEXT(nssrc);                    shared.e2 = sersic_shape[i];
             i += DEXT(nssrc);                    shared.sersic_scale = sersic_shape[i];
         }
-        if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z < 3)
-            shared.X2_grad_part[threadIdx.z] = 0.;
+        if(threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
+            shared.X2_grad_part[threadIdx.x] = 0.;
 
         __syncthreads();
 
@@ -325,7 +325,7 @@ class SumCohTraits<double> {
         // 3 different threads writes each a different component to avoid serialisation
         if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
         {
-            i = SRC - DEXT(npsrc) - DEXT(ngsrc) + threadIdx.x*DEXT(nssrc);
+            i = SRC - SRC_START + threadIdx.x*NSSRC;
             atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]);
         }
     }

From 4738328ab85cb77dd2fcc2bb2feffbf7b11e9498 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 17 Aug 2016 14:52:14 +0100
Subject: [PATCH 20/33] Composite solver merged with the master

---
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 195 +++++++++++-------
 1 file changed, 123 insertions(+), 72 deletions(-)

diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index aac4ddcb4..84abfcd80 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -18,6 +18,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, see <http://www.gnu.org/licenses/>.
 
+from collections import defaultdict
 from copy import (copy as shallowcopy, deepcopy)
 import functools
 import itertools
@@ -31,6 +32,8 @@
 import pycuda.driver as cuda
 import pycuda.tools
 
+import hypercube.util as hcu
+
 import montblanc
 import montblanc.util as mbu
 
@@ -63,6 +66,10 @@
 ORDERING_RANK = [' or '.join(['nsrc'] + mbu.source_nr_vars()),
     'ntime', ' or '.join(['nbl', 'na']), 'nchan']
 
+def _update_refs(pool, new_refs):
+    for key, value in new_refs.iteritems():
+        pool[key].extend(value)
+
 class CompositeRimeSolver(MontblancNumpySolver):
     """
     Composite solver implementation for RIME.
@@ -265,8 +272,9 @@ def _gen_source_slices(self):
 
             # Get the source slice ranges for each individual
             # source type, and update the CPU dictionary with them
-            cpu_slice.update(mbu.source_range_slices(
-                src, src_end, src_nr_var_counts))
+            src_range_slices = mbu.source_range_slices(
+                src, src_end, src_nr_var_counts)
+            cpu_slice.update(src_range_slices)
 
             # and configure the same for GPU slices
             for s in src_nr_vars:
@@ -449,8 +457,8 @@ def _enqueue_array_slice(self, r, subslvr,
         cache_idx = dirty.get(r.name, None)
 
         if cache_idx and cache_idx == cpu_idx:
-            montblanc.log.debug("Cache hit on {n} index {i} "
-                    .format(n=r.name, i=cpu_idx))
+            #montblanc.log.debug("Cache hit on {n} index {i} "
+            #        .format(n=r.name, i=cpu_idx))
 
             return None
 
@@ -514,7 +522,7 @@ def _enqueue_array(self, subslvr,
         Returns a list of arrays allocated with pinned memory.
         Entries in this list may be None.
         """
-        pool_refs = []
+        pool_refs = defaultdict(list)
 
         if direction is None:
             direction = ASYNC_HTOD
@@ -569,7 +577,8 @@ def _enqueue_array(self, subslvr,
                 cpu_ary, cpu_idx, gpu_ary, tuple(gpu_idx),
                 direction, dirty)
 
-            pool_refs.append(pinned_ary)
+            if pinned_ary is not None:
+                pool_refs[r.name].append(pinned_ary)
 
         return pool_refs
 
@@ -715,6 +724,11 @@ def _thread_create_solvers(self, subslvr_cfg, P, nsolvers):
         # Mutex for guarding the memory pools
         pool_lock = threading.Lock()
 
+        # Dirty index, indicating the CPU index of the
+        # data currently on the GPU, used for avoiding
+        # array transfer
+        self.thread_local.dirty = [{} for n in range(nsolvers)]
+
         # Configure thread local storage
         # Number of solvers in this thread
         self.thread_local.nsolvers = nsolvers
@@ -781,8 +795,8 @@ def _thread_prime_memory_pools(self):
         nsrc = self.dim_local_size('nsrc')
 
         # Retain references to pool allocations
-        pinned_pool_refs = []
-        device_pool_refs = []
+        pinned_pool_refs = defaultdict(list)
+        device_pool_refs = defaultdict(list)
         pinned_allocated = 0
 
         # Class of arrays that are to be transferred
@@ -794,15 +808,15 @@ def _thread_prime_memory_pools(self):
         # Estimate number of kernels for constant data
         nkernels = len(classifiers)
 
+        # Detect already transferred array chunks
+        dirty = {}
+
         # Get the first chunk of the visibility space
         cpu_slice_map, gpu_slice_map = self._gen_vis_slices().next()
 
         for i, subslvr in enumerate(self.thread_local.solvers):
             # For the maximum number of visibility chunks that can be enqueued
             for T in range(self.throttle_factor):
-                # Detect already transferred array chunks
-                dirty = {}
-
                 # For each source batch within the visibility chunk
                 for cpu_src_slice_map, gpu_src_slice_map in self._gen_source_slices():
                     cpu_slice_map.update(cpu_src_slice_map)
@@ -814,22 +828,23 @@ def _thread_prime_memory_pools(self):
                         cpu_slice_map, gpu_slice_map, 
                         direction=ASYNC_HTOD, dirty=dirty,
                         classifiers=classifiers)
-                    pinned_allocated += sum([r.nbytes for r in refs if r is not None])
-                    pinned_pool_refs.extend(refs)
+                    pinned_allocated += sum([r.nbytes
+                        for l in refs.values() for r in l])
+                    _update_refs(pinned_pool_refs, refs)
 
                     # Allocate pinned memory for constant memory transfers
                     cdata = subslvr.const_data().ndary()
 
                     for k in range(nkernels):
-                        ref = subslvr.pinned_mem_pool.allocate(
+                        cdata_ref = subslvr.pinned_mem_pool.allocate(
                             shape=cdata.shape, dtype=cdata.dtype)
-                        pinned_allocated += ref.nbytes
-                        pinned_pool_refs.append(ref)
+                        pinned_allocated += cdata_ref.nbytes
+                        pinned_pool_refs['cdata'].append(cdata_ref)
 
                     # Allocate device memory for arrays that need to be
                     # allocated from a pool by PyCUDA's reduction kernels
                     dev_ary = subslvr.dev_mem_pool.allocate(self.X2.nbytes)
-                    device_pool_refs.append(dev_ary)
+                    device_pool_refs['X2_gpu'].append(dev_ary)
 
         device = self.thread_local.context.get_device()
 
@@ -838,12 +853,23 @@ def _thread_prime_memory_pools(self):
                 d=device.name(), n=mbu.fmt_bytes(pinned_allocated)))
 
         # Now force return of memory to the pools
-        for a in pinned_pool_refs:
-            if a is not None:
-                a.base.free()
+        for key, array_list in pinned_pool_refs.iteritems():
+            [a.base.free() for a in array_list]      
+
+        for array_list in device_pool_refs.itervalues():
+            [a.free() for a in array_list]
+
+
+    def _thread_start_solving(self):
+        """
+        Contains enqueue thread functionality that needs to
+        take place at the start of each solution
+        """
+
+        # Clear the dirty dictionary to force each array to be
+        # transferred at least once. e.g. the beam cube
+        [d.clear() for d in self.thread_local.dirty]
 
-        for a in device_pool_refs:
-            a.free()
 
     def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         """
@@ -860,17 +886,17 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         tl = self.thread_local
         i, subslvr = tl.subslvr_gen.next()
 
-        # A list of references to memory pool allocated objects
+        # A dictionary of references to memory pool allocated objects
         # ensuring that said objects remained allocated until
         # after compute has been performed. Returned from
         # this function, this object should be discarded when
         # reading the result of the enqueued operations.
-        pool_refs = []
+        pool_refs = defaultdict(list)
 
         # Cache keyed by array names and contained indices
-        # This is used to determine if the array has already
-        # been transferred to the card on this batch
-        dirty = {}
+        # This is used to avoid unnecessary CPU to GPU copies
+        # by caching the last index of the CPU array
+        dirty = tl.dirty[i]
 
         # Guard pool allocations with a coarse-grained mutex
         with subslvr.pool_lock:
@@ -890,42 +916,50 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
 
                 # Enqueue E Beam
                 kernel = subslvr.rime_e_beam
-                pool_refs.extend(self._enqueue_array(
+                new_refs = self._enqueue_array(
                     subslvr, cpu_slice_map, gpu_slice_map,
                     direction=ASYNC_HTOD, dirty=dirty,
-                    classifiers=[Classifier.E_BEAM_INPUT]))
-                pool_refs.append(self._enqueue_const_data_htod(
-                    subslvr, kernel.rime_const_data[0]))
+                    classifiers=[Classifier.E_BEAM_INPUT])
+                cdata_ref = self._enqueue_const_data_htod(
+                    subslvr, kernel.rime_const_data[0])
+                _update_refs(pool_refs, new_refs)
+                _update_refs(pool_refs, {'cdata_ebeam' : [cdata_ref]})
                 kernel.execute(subslvr, subslvr.stream)
 
                 # Enqueue B Sqrt
                 kernel = subslvr.rime_b_sqrt
-                pool_refs.extend(self._enqueue_array(
+                new_refs = self._enqueue_array(
                     subslvr, cpu_slice_map, gpu_slice_map,
                     direction=ASYNC_HTOD, dirty=dirty,
-                    classifiers=[Classifier.B_SQRT_INPUT]))
-                pool_refs.append(self._enqueue_const_data_htod(
-                    subslvr, kernel.rime_const_data[0]))
+                    classifiers=[Classifier.B_SQRT_INPUT])
+                cdata_ref = self._enqueue_const_data_htod(
+                    subslvr, kernel.rime_const_data[0])
+                _update_refs(pool_refs, new_refs)
+                _update_refs(pool_refs, {'cdata_bsqrt' : [cdata_ref]})
                 kernel.execute(subslvr, subslvr.stream)
 
                 # Enqueue EKB Sqrt
                 kernel = subslvr.rime_ekb_sqrt
-                pool_refs.extend(self._enqueue_array(
+                new_refs = self._enqueue_array(
                     subslvr, cpu_slice_map, gpu_slice_map,
                     direction=ASYNC_HTOD, dirty=dirty,
-                    classifiers=[Classifier.EKB_SQRT_INPUT]))
-                pool_refs.append(self._enqueue_const_data_htod(
-                    subslvr, kernel.rime_const_data[0]))
+                    classifiers=[Classifier.EKB_SQRT_INPUT])
+                cdata_ref = self._enqueue_const_data_htod(
+                    subslvr, kernel.rime_const_data[0])
+                _update_refs(pool_refs, new_refs)
+                _update_refs(pool_refs, {'cdata_ekb' : [cdata_ref]})
                 kernel.execute(subslvr, subslvr.stream)
 
                 # Enqueue Sum Coherencies
                 kernel = subslvr.rime_sum
-                pool_refs.extend(self._enqueue_array(
+                new_refs = self._enqueue_array(
                     subslvr, cpu_slice_map, gpu_slice_map,
                     direction=ASYNC_HTOD, dirty=dirty,
-                    classifiers=[Classifier.COHERENCIES_INPUT]))
-                pool_refs.append(self._enqueue_const_data_htod(
-                    subslvr, kernel.rime_const_data[0]))
+                    classifiers=[Classifier.COHERENCIES_INPUT])
+                cdata_ref = self._enqueue_const_data_htod(
+                    subslvr, kernel.rime_const_data[0])
+                _update_refs(pool_refs, new_refs)
+                _update_refs(pool_refs, {'cdata_coherencies' : [cdata_ref]})
                 kernel.execute(subslvr, subslvr.stream)
 
             # Iterate again for gradient computation requiring visibilities computed
@@ -945,12 +979,14 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
     
                     # Enqueue Sersic gradient
                     kernel = subslvr.rime_sersic_gradient
-                    pool_refs.extend(self._enqueue_array(
+                    new_refs = self._enqueue_array(
                         subslvr, cpu_slice_map, gpu_slice_map,
                         direction=ASYNC_HTOD, dirty=dirty,
-                        classifiers=[Classifier.COHERENCIES_INPUT]))
-                    pool_refs.append(self._enqueue_const_data_htod(
-                        subslvr, kernel.rime_const_data[0]))
+                        classifiers=[Classifier.COHERENCIES_INPUT])
+                    cdata_ref = self._enqueue_const_data_htod(
+                        subslvr, kernel.rime_const_data[0])
+                    _update_refs(pool_refs, new_refs)
+                    _update_refs(pool_refs, {'cdata_gradient' : [cdata_ref]})
                     kernel.execute(subslvr, subslvr.stream)
 
             # Enqueue chi-squared term reduction and return the
@@ -965,20 +1001,21 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
             X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream)
 
             # Enqueue transfer of simulator output (model visibilities) to the CPU
-            refs = self._enqueue_array(subslvr, cpu_slice_map, gpu_slice_map,
-                direction=ASYNC_DTOH, dirty=dirty,
+            sim_output_refs = self._enqueue_array(subslvr, 
+                cpu_slice_map, gpu_slice_map,
+                direction=ASYNC_DTOH, dirty={},
                 classifiers=[Classifier.SIMULATOR_OUTPUT])
 
         # Should only be model visibilities
         if self.enable_sersic_grad:
-            assert len(refs) == 2, ('Expected two arrays (model visibilities, X2_grad), '
+            assert len(sim_output_refs) == 2, ('Expected two arrays (model visibilities, X2_grad), '
                 'received {l} instead.'.format(l=len(refs)))
-            X2_grad = refs[1]
+            X2_grad = sim_output_refs['X2_grad'][0]
         else:
-            assert len(refs) == 1, ('Expected one array (model visibilities), '
+            assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), '
                 'received {l} instead.'.format(l=len(refs)))
 
-        model_vis = refs[0]
+        model_vis = sim_output_refs['model_vis'][0]
 
         # Create and record an event directly after the chi-squared copy
         # We'll synchronise on this thread in our synchronisation executor
@@ -988,11 +1025,11 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
 
         # Retain references to CPU pinned  and GPU device memory
         # until the above enqueued operations have been performed.
-        pool_refs.append(X2_gpu_ary)
-        pool_refs.append(sub_X2)
-        pool_refs.append(model_vis)
+        pool_refs['X2_gpu'].append(X2_gpu_ary)
+        pool_refs['X2_cpu'].append(sub_X2)
+        pool_refs['model_vis_output'].append(model_vis)
         if self.enable_sersic_grad:
-            pool_refs.append(X2_grad)
+            pool_refs['X2_grad'].append(X2_grad)
             return (sync_event, sub_X2, model_vis, X2_grad,
                 pool_refs, subslvr.pool_lock,
                 cpu_slice_map.copy(),
@@ -1028,19 +1065,26 @@ def _free_pool_allocs(pool_refs, pool_lock):
             import pycuda.driver as cuda
             import pycuda.gpuarray as gpuarray
 
+            cuda_types = (cuda.PooledDeviceAllocation, cuda.PooledHostAllocation)
+
+            debug_str_list = ['Pool de-allocations per array name',]
+            debug_str_list.extend('({k}, {l})'.format(k=k,l=len(v))
+                for k, v in pool_refs.iteritems())
+
+            montblanc.log.debug(' '.join(debug_str_list))
+
             with pool_lock:
-                for ref in pool_refs:
-                    if ref is None:
-                        continue
-                    elif isinstance(ref, np.ndarray):
+                for k, ref in ((k, r) for k, rl in pool_refs.iteritems()
+                                for r in rl):
+                    if isinstance(ref, np.ndarray):
                         ref.base.free()
-                    elif isinstance(ref, (cuda.PooledDeviceAllocation, cuda.PooledHostAllocation)):
+                    elif isinstance(ref, cuda_types):
                         ref.free()
                     elif isinstance(ref, gpuarray.GPUArray):
                         ref.gpudata.free()
                     else:
-                        raise TypeError("Unable to release pool allocated "
-                            "object of type {t}.".format(t=type(ref)))
+                        raise TypeError("Don't know how to release pool allocated "
+                            "object '{n}'' of type {t}.".format(n=k, t=type(ref)))
 
         def _sync_wait(future):
             """
@@ -1058,15 +1102,14 @@ def _sync_wait(future):
                 cuda_event.synchronize()
             except cuda.LogicError as e:
                 # Format the slices nicely
-                for k, s in cpu.iteritems():
-                    cpu[k] = '[{b}, {e}]'.format(b=s.start, e=s.stop)
-
-                for k, s in gpu.iteritems():
-                    gpu[k] = '[{b}, {e}]'.format(b=s.start, e=s.stop)
+                pretty_cpu = { k: '[{b}, {e}]'.format(b=s.start, e=s.stop) 
+                    for k, s in cpu.iteritems() }
+                pretty_gpu = { k: '[{b}, {e}]'.format(b=s.start, e=s.stop) 
+                    for k, s in gpu.iteritems() }
 
                 import json
-                print 'GPU', json.dumps(gpu, indent=2)
-                print 'CPU', json.dumps(cpu, indent=2)
+                print 'GPU', json.dumps(pretty_gpu, indent=2)
+                print 'CPU', json.dumps(pretty_cpu, indent=2)
                 raise e, None, sys.exc_info()[2]
 
             # Work out the CPU view in the model visibilities
@@ -1093,6 +1136,12 @@ def _sync_wait(future):
         # For easier typing
         C = CompositeRimeSolver
 
+        # Perform any initialisation required by each
+        # thread prior to solving
+        for f in cf.as_completed([ex.submit(self._thread_start_solving)
+                for ex in self.enqueue_executors]):
+            f.result()
+
         # Create an iterator that cycles through each device's
         # executors, also returning the device index (enumerate)
         ex_it = itertools.cycle(enumerate(
@@ -1223,6 +1272,8 @@ def shutdown(self):
         def _shutdown_func():
             for i, subslvr in enumerate(self.thread_local.solvers):
                 subslvr.shutdown()
+                subslvr.dev_mem_pool.stop_holding()
+                subslvr.pinned_mem_pool.stop_holding()
 
             self._thread_shutdown()
 
@@ -1236,7 +1287,7 @@ def _shutdown_func():
 
     def _thread_property_setter(self, name, value):
         for subslvr in self.thread_local.solvers:
-            setter_method_name = self.setter_name(name)
+            setter_method_name = hcu.setter_name(name)
             setter_method = getattr(subslvr, setter_method_name)
             setter_method(value)
 

From 3d5d068bdada467475df3026822d084af641cf68 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Thu, 18 Aug 2016 18:29:13 +0100
Subject: [PATCH 21/33] minor change

---
 .../impl/rime/v4/gpu/SersicChiSquaredGradient.py    | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 2ed9cb437..5581ab3d3 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -374,7 +374,7 @@ def __init__(self, weight_vector=False):
 
     def initialise(self, solver, stream=None):
         slvr = solver
-        nssrc, ntime, nbl, npolchan = slvr.dim_local_size('nssrc','ntime', 'nbl', 'npolchan')
+        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
 
           # Get a property dictionary off the solver
         D = slvr.template_dict()
@@ -406,11 +406,6 @@ def initialise(self, solver, stream=None):
             include_dirs=[montblanc.get_source_path()],
             no_extern_c=True)
 
-        if slvr.is_float():
-            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float32)
-        else:
-            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64)
-
         self.rime_const_data = self.mod.get_global('C')
         self.kernel = self.mod.get_function(kname)
         self.launch_params = self.get_launch_params(slvr, D)
@@ -452,7 +447,11 @@ def execute(self, solver, stream=None):
         sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
             else slvr.sersic_shape
 
-        self.gradient.fill(0.) 
+        nssrc = slvr.dim_local_size('nssrc') 
+        if slvr.is_float():
+            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float32)
+        else:
+            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64)
 
         self.kernel(slvr.uvw, sersic,
             slvr.frequency, slvr.antenna1, slvr.antenna2,

From 7bd82e95ab28a3890c838a2d00d0e5dc1b56c283 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Thu, 18 Aug 2016 18:30:06 +0100
Subject: [PATCH 22/33] bug fixed: X2_grad initialisation

---
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index 84abfcd80..ec2b0a092 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -965,6 +965,7 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
             # Iterate again for gradient computation requiring visibilities computed
             # for all source chuncks
             if self.enable_sersic_grad:
+                subslvr.X2_grad.fill(0, stream=subslvr.stream)
                 for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices():
                     # Update our maps with source slice information
                     cpu_slice_map.update(src_cpu_slice_map)
@@ -1190,8 +1191,8 @@ def _accumulate(model_slice, model_chunk):
                     if nvalues > self.throttle_factor:
                         continue
 
-                    if self.enable_sersic_grad:
-                        self.X2_grad = self.ft(np.zeros((3,nssrc)))
+                    #if self.enable_sersic_grad:
+                    #    self.X2_grad = self.ft(np.zeros((3,nssrc)))
 
                     # Enqueue CUDA operations for solving
                     # this visibility chunk. 

From 8e0866ee29c69a9df3e4777f296436951ee141cb Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Thu, 18 Aug 2016 19:24:41 +0100
Subject: [PATCH 23/33] minor change

---
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index ec2b0a092..ab6e5036b 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -1010,11 +1010,11 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         # Should only be model visibilities
         if self.enable_sersic_grad:
             assert len(sim_output_refs) == 2, ('Expected two arrays (model visibilities, X2_grad), '
-                'received {l} instead.'.format(l=len(refs)))
+                'received {l} instead.'.format(l=len(new_refs)))
             X2_grad = sim_output_refs['X2_grad'][0]
         else:
             assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), '
-                'received {l} instead.'.format(l=len(refs)))
+                'received {l} instead.'.format(l=len(new_refs)))
 
         model_vis = sim_output_refs['model_vis'][0]
 
@@ -1191,9 +1191,6 @@ def _accumulate(model_slice, model_chunk):
                     if nvalues > self.throttle_factor:
                         continue
 
-                    #if self.enable_sersic_grad:
-                    #    self.X2_grad = self.ft(np.zeros((3,nssrc)))
-
                     # Enqueue CUDA operations for solving
                     # this visibility chunk. 
                     enqueue_future = enq_ex.submit(
@@ -1266,7 +1263,8 @@ def _accumulate(model_slice, model_chunk):
             self.set_X2(X2_sum)
         else:
             self.set_X2(X2_sum/self.sigma_sqrd)
-            self.X2_grad = X2_grad/self.sigma_sqrd
+            if self.enable_sersic_grad:
+                self.X2_grad = X2_grad/self.sigma_sqrd
 
     def shutdown(self):
         """ Shutdown the solver """

From 5abd1e2e87db02993aba5b9847da544ea0265183 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Fri, 19 Aug 2016 11:50:26 +0100
Subject: [PATCH 24/33] change X2_grad classifier into GPU_SCRATCH to manage
 the case where gradient is not enabled

---
 montblanc/impl/rime/v4/config.py              |  2 +-
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py
index edf14a349..158a93fe0 100644
--- a/montblanc/impl/rime/v4/config.py
+++ b/montblanc/impl/rime/v4/config.py
@@ -236,5 +236,5 @@ class Classifier(Enum):
     ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
     ary_dict('X2_grad', (3,'nssrc',), 'ft',
-        classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])),
+        classifiers=frozenset([Classifier.GPU_SCRATCH])),
 ]
diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index ab6e5036b..22bb34804 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -1001,6 +1001,12 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
             # Enqueue chi-squared copy off the GPU onto the CPU
             X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream)
 
+            # Get pinned memory and copy from the GPU the partial X2_grad
+            if self.enable_sersic_grad:
+                X2_grad = subslvr.pinned_mem_pool.allocate(
+                    shape=subslvr.X2_grad.shape, dtype=subslvr.X2_grad.dtype)
+                subslvr.X2_grad.get_async(ary=X2_grad, stream=subslvr.stream)
+
             # Enqueue transfer of simulator output (model visibilities) to the CPU
             sim_output_refs = self._enqueue_array(subslvr, 
                 cpu_slice_map, gpu_slice_map,
@@ -1008,13 +1014,8 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                 classifiers=[Classifier.SIMULATOR_OUTPUT])
 
         # Should only be model visibilities
-        if self.enable_sersic_grad:
-            assert len(sim_output_refs) == 2, ('Expected two arrays (model visibilities, X2_grad), '
-                'received {l} instead.'.format(l=len(new_refs)))
-            X2_grad = sim_output_refs['X2_grad'][0]
-        else:
-            assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), '
-                'received {l} instead.'.format(l=len(new_refs)))
+        assert len(sim_output_refs) == 1, ('Expected one array (model visibilities), '
+            'received {l} instead.'.format(l=len(new_refs)))
 
         model_vis = sim_output_refs['model_vis'][0]
 

From a65b50f53a1e676101150d5e04405a937a759a61 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Fri, 19 Aug 2016 16:52:15 +0100
Subject: [PATCH 25/33] Fixed X2_grad array management in case of more src
 chunks

---
 .../rime/v4/gpu/SersicChiSquaredGradient.py   |  2 +-
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 26 ++++++++++++-------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 5581ab3d3..89ed7b810 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -325,7 +325,7 @@ class SumCohTraits<double> {
         // 3 different threads writes each a different component to avoid serialisation
         if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
         {
-            i = SRC - SRC_START + threadIdx.x*NSSRC;
+            i = SRC - SRC_START + threadIdx.x*DEXT(nssrc);
             atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]);
         }
     }
diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index 22bb34804..8f2586b40 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -181,6 +181,7 @@ def __init__(self, slvr_cfg):
                     key=lambda T: T[1])
 
         P, M, mem = budgets[0]
+        #P[Options.NSRC] = slvr_cfg.get(Options.SOURCE_BATCH_SIZE)
 
         # Log some information about the memory budget
         # and dimension reduction
@@ -965,7 +966,12 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
             # Iterate again for gradient computation requiring visibilities computed
             # for all source chuncks
             if self.enable_sersic_grad:
-                subslvr.X2_grad.fill(0, stream=subslvr.stream)
+
+                # Get pinned memory to hold the full X2 gradient
+                nssrc = self.dim_local_size('nssrc')
+                sub_X2_grad = subslvr.pinned_mem_pool.allocate(
+                    shape=(3,nssrc), dtype=subslvr.X2_grad.dtype)
+
                 for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices():
                     # Update our maps with source slice information
                     cpu_slice_map.update(src_cpu_slice_map)
@@ -977,7 +983,9 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                             global_size=self.dimension(name).global_size,
                             lower_extent=slice_.start,
                             upper_extent=slice_.stop)
-    
+
+                    subslvr.X2_grad.fill(0, stream=subslvr.stream)                   
+      
                     # Enqueue Sersic gradient
                     kernel = subslvr.rime_sersic_gradient
                     new_refs = self._enqueue_array(
@@ -990,6 +998,10 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                     _update_refs(pool_refs, {'cdata_gradient' : [cdata_ref]})
                     kernel.execute(subslvr, subslvr.stream)
 
+                    # Enqueue the X2 gradient slice opy off the GPU onto the CPU
+                    src_range = src_cpu_slice_map['nssrc']
+                    subslvr.X2_grad.get_async(ary=sub_X2_grad[:,src_range], stream=subslvr.stream)
+
             # Enqueue chi-squared term reduction and return the
             # GPU array allocated to it
             X2_gpu_ary = subslvr.rime_reduce.execute(subslvr, subslvr.stream)
@@ -1001,12 +1013,6 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
             # Enqueue chi-squared copy off the GPU onto the CPU
             X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream)
 
-            # Get pinned memory and copy from the GPU the partial X2_grad
-            if self.enable_sersic_grad:
-                X2_grad = subslvr.pinned_mem_pool.allocate(
-                    shape=subslvr.X2_grad.shape, dtype=subslvr.X2_grad.dtype)
-                subslvr.X2_grad.get_async(ary=X2_grad, stream=subslvr.stream)
-
             # Enqueue transfer of simulator output (model visibilities) to the CPU
             sim_output_refs = self._enqueue_array(subslvr, 
                 cpu_slice_map, gpu_slice_map,
@@ -1031,8 +1037,8 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         pool_refs['X2_cpu'].append(sub_X2)
         pool_refs['model_vis_output'].append(model_vis)
         if self.enable_sersic_grad:
-            pool_refs['X2_grad'].append(X2_grad)
-            return (sync_event, sub_X2, model_vis, X2_grad,
+            pool_refs['X2_grad'].append(sub_X2_grad)
+            return (sync_event, sub_X2, model_vis, sub_X2_grad,
                 pool_refs, subslvr.pool_lock,
                 cpu_slice_map.copy(),
                 gpu_slice_map.copy())

From 7457d38869f01436afc5b23bbe674bdb8d98fd9c Mon Sep 17 00:00:00 2001
From: Simon Perkins <simon.perkins@gmail.com>
Date: Thu, 25 Aug 2016 10:58:51 +0200
Subject: [PATCH 26/33] Pass results via dict, instead of tuple

Provides more flexibility for different result types
---
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 84 +++++++++++--------
 1 file changed, 47 insertions(+), 37 deletions(-)

diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index 5bbb6abd7..1ef142239 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -1049,17 +1049,22 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         pool_refs['X2_gpu'].append(X2_gpu_ary)
         pool_refs['X2_cpu'].append(sub_X2)
         pool_refs['model_vis_output'].append(model_vis)
+
+        data = { 'sync_event' : sync_event,
+            'sub_X2' : sub_X2,
+            'model_vis' : model_vis,
+            'pool_refs' : pool_refs,
+            'pool_lock' : subslvr.pool_lock,
+            'cpu' : cpu_slice_map.copy(),
+            'gpu' : gpu_slice_map.copy(),
+        }
+
+        # Add X2 gradient data
         if self.enable_sersic_grad:
-            pool_refs['X2_grad'].append(sub_X2_grad)
-            return (sync_event, sub_X2, model_vis, sub_X2_grad,
-                pool_refs, subslvr.pool_lock,
-                cpu_slice_map.copy(),
-                gpu_slice_map.copy())
-        else:
-            return (sync_event, sub_X2, model_vis,
-                pool_refs, subslvr.pool_lock,
-                cpu_slice_map.copy(),
-                gpu_slice_map.copy())
+            data['sub_X2_grad'] = sub_X2_grad
+            data['pool_refs']['X2_grad'].append(sub_X2_grad)
+
+        return data
 
     def initialise(self):
         """ Initialise the sub-solver """
@@ -1112,12 +1117,10 @@ def _sync_wait(future):
             Return a copy of the pinned chi-squared, pinned model visibilities
             and index into the CPU array after synchronizing on the cuda_event
             """
-            if self.enable_sersic_grad:
-                cuda_event, pinned_X2, pinned_model_vis, pinned_X2_grad,\
-                    pool_refs, pool_lock, cpu, gpu = future.result()
-            else:
-                cuda_event, pinned_X2, pinned_model_vis, \
-                    pool_refs, pool_lock, cpu, gpu = future.result()
+            data = future.result()
+
+            cuda_event, pool_refs, pool_lock, cpu, gpu = (data[k] for k in
+                ('sync_event', 'pool_refs', 'pool_lock', 'cpu', 'gpu'))
 
             try:
                 cuda_event.synchronize()
@@ -1142,17 +1145,18 @@ def _sync_wait(future):
             model_vis_shape = tuple(cpu[s].stop - cpu[s].start
                 if s in cpu else s for s in model_vis_sshape)
 
-            X2 = pinned_X2.copy()
-            model_vis = pinned_model_vis.copy().reshape(model_vis_shape)
+            result = {
+                'X2' : data['sub_X2'].copy(),
+                'model_vis' : data['model_vis'].copy().reshape(model_vis_shape),
+                'model_vis_idx' : model_vis_idx,
+            }
+
             if self.enable_sersic_grad:
-                X2_grad = pinned_X2_grad.copy()
+                result['X2_grad'] = data['sub_X2_grad'].copy()
 
             _free_pool_allocs(pool_refs, pool_lock)
 
-            if self.enable_sersic_grad:
-                return X2, model_vis, model_vis_idx, X2_grad
-            else:
-                return X2, model_vis, model_vis_idx
+            return result
 
         # For easier typing
         C = CompositeRimeSolver
@@ -1244,15 +1248,18 @@ def _accumulate(model_slice, model_chunk):
                             s.discard(f)
 
                         # Get chi-squared and model visibilities (and X2 gradient)
-                        if self.enable_sersic_grad:
-                            X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
-                            X2_grad += pinned_X2_grad
-                        else:
-                            X2, pinned_model_vis, model_vis_idx = f.result()
+                        result = f.result()
                         # Sum X2
-                        X2_sum += X2
+                        X2_sum += result['X2']
+
+                        if self.enable_sersic_grad:
+                            X2_sum += result['X2_grad']
+
+                        model_vis_idx = result['model_vis_idx']
+                        pinned_model_vis = result['model_vis']
                         # Write model visibilities to the numpy array
-                        vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:])
+                        vis_write(self.model_vis[model_vis_idx],
+                            pinned_model_vis[:])
 
                         # Break out if we've removed the prescribed
                         # number of futures
@@ -1267,15 +1274,18 @@ def _accumulate(model_slice, model_chunk):
         # Sum remaining chi-squared
         for f in done:
             # Get chi-squared and model visibilities (and X2 gradient)
-            if self.enable_sersic_grad:
-                X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
-                X2_grad += pinned_X2_grad
-            else:
-                X2, pinned_model_vis, model_vis_idx = f.result()
+            result = f.result()
             # Sum X2
-            X2_sum += X2
+            X2_sum += result['X2']
+
+            if self.enable_sersic_grad:
+                X2_sum += result['X2_grad']
+
+            model_vis_idx = result['model_vis_idx']
+            pinned_model_vis = result['model_vis']
             # Write model visibilities to the numpy array
-            vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:])
+            vis_write(self.model_vis[model_vis_idx],
+                pinned_model_vis[:])
 
         # Set the chi-squared value possibly
         # taking the weight vector into account

From 15e99b84092f07e79596ef0354935b7bc4f9315c Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Thu, 8 Sep 2016 17:01:17 +0100
Subject: [PATCH 27/33] Revert "Pass results via dict, instead of tuple"

This reverts commit 7457d38869f01436afc5b23bbe674bdb8d98fd9c.
---
 montblanc/impl/rime/v5/CompositeRimeSolver.py | 84 ++++++++-----------
 1 file changed, 37 insertions(+), 47 deletions(-)

diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index 1ef142239..5bbb6abd7 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -1049,22 +1049,17 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         pool_refs['X2_gpu'].append(X2_gpu_ary)
         pool_refs['X2_cpu'].append(sub_X2)
         pool_refs['model_vis_output'].append(model_vis)
-
-        data = { 'sync_event' : sync_event,
-            'sub_X2' : sub_X2,
-            'model_vis' : model_vis,
-            'pool_refs' : pool_refs,
-            'pool_lock' : subslvr.pool_lock,
-            'cpu' : cpu_slice_map.copy(),
-            'gpu' : gpu_slice_map.copy(),
-        }
-
-        # Add X2 gradient data
         if self.enable_sersic_grad:
-            data['sub_X2_grad'] = sub_X2_grad
-            data['pool_refs']['X2_grad'].append(sub_X2_grad)
-
-        return data
+            pool_refs['X2_grad'].append(sub_X2_grad)
+            return (sync_event, sub_X2, model_vis, sub_X2_grad,
+                pool_refs, subslvr.pool_lock,
+                cpu_slice_map.copy(),
+                gpu_slice_map.copy())
+        else:
+            return (sync_event, sub_X2, model_vis,
+                pool_refs, subslvr.pool_lock,
+                cpu_slice_map.copy(),
+                gpu_slice_map.copy())
 
     def initialise(self):
         """ Initialise the sub-solver """
@@ -1117,10 +1112,12 @@ def _sync_wait(future):
             Return a copy of the pinned chi-squared, pinned model visibilities
             and index into the CPU array after synchronizing on the cuda_event
             """
-            data = future.result()
-
-            cuda_event, pool_refs, pool_lock, cpu, gpu = (data[k] for k in
-                ('sync_event', 'pool_refs', 'pool_lock', 'cpu', 'gpu'))
+            if self.enable_sersic_grad:
+                cuda_event, pinned_X2, pinned_model_vis, pinned_X2_grad,\
+                    pool_refs, pool_lock, cpu, gpu = future.result()
+            else:
+                cuda_event, pinned_X2, pinned_model_vis, \
+                    pool_refs, pool_lock, cpu, gpu = future.result()
 
             try:
                 cuda_event.synchronize()
@@ -1145,18 +1142,17 @@ def _sync_wait(future):
             model_vis_shape = tuple(cpu[s].stop - cpu[s].start
                 if s in cpu else s for s in model_vis_sshape)
 
-            result = {
-                'X2' : data['sub_X2'].copy(),
-                'model_vis' : data['model_vis'].copy().reshape(model_vis_shape),
-                'model_vis_idx' : model_vis_idx,
-            }
-
+            X2 = pinned_X2.copy()
+            model_vis = pinned_model_vis.copy().reshape(model_vis_shape)
             if self.enable_sersic_grad:
-                result['X2_grad'] = data['sub_X2_grad'].copy()
+                X2_grad = pinned_X2_grad.copy()
 
             _free_pool_allocs(pool_refs, pool_lock)
 
-            return result
+            if self.enable_sersic_grad:
+                return X2, model_vis, model_vis_idx, X2_grad
+            else:
+                return X2, model_vis, model_vis_idx
 
         # For easier typing
         C = CompositeRimeSolver
@@ -1248,18 +1244,15 @@ def _accumulate(model_slice, model_chunk):
                             s.discard(f)
 
                         # Get chi-squared and model visibilities (and X2 gradient)
-                        result = f.result()
-                        # Sum X2
-                        X2_sum += result['X2']
-
                         if self.enable_sersic_grad:
-                            X2_sum += result['X2_grad']
-
-                        model_vis_idx = result['model_vis_idx']
-                        pinned_model_vis = result['model_vis']
+                            X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
+                            X2_grad += pinned_X2_grad
+                        else:
+                            X2, pinned_model_vis, model_vis_idx = f.result()
+                        # Sum X2
+                        X2_sum += X2
                         # Write model visibilities to the numpy array
-                        vis_write(self.model_vis[model_vis_idx],
-                            pinned_model_vis[:])
+                        vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:])
 
                         # Break out if we've removed the prescribed
                         # number of futures
@@ -1274,18 +1267,15 @@ def _accumulate(model_slice, model_chunk):
         # Sum remaining chi-squared
         for f in done:
             # Get chi-squared and model visibilities (and X2 gradient)
-            result = f.result()
-            # Sum X2
-            X2_sum += result['X2']
-
             if self.enable_sersic_grad:
-                X2_sum += result['X2_grad']
-
-            model_vis_idx = result['model_vis_idx']
-            pinned_model_vis = result['model_vis']
+                X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
+                X2_grad += pinned_X2_grad
+            else:
+                X2, pinned_model_vis, model_vis_idx = f.result()
+            # Sum X2
+            X2_sum += X2
             # Write model visibilities to the numpy array
-            vis_write(self.model_vis[model_vis_idx],
-                pinned_model_vis[:])
+            vis_write(self.model_vis[model_vis_idx], pinned_model_vis[:])
 
         # Set the chi-squared value possibly
         # taking the weight vector into account

From 4879f6b2ec0dc00a908bb5dbe459185532b5813e Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Thu, 8 Sep 2016 17:52:02 +0100
Subject: [PATCH 28/33] replaced atomic addition in the shared memory with the
 block_reduce function

---
 .../rime/v4/gpu/SersicChiSquaredGradient.py   | 53 ++++++++-----------
 1 file changed, 23 insertions(+), 30 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 89ed7b810..02e7fe463 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -34,15 +34,15 @@
 
 FLOAT_PARAMS = {
     'BLOCKDIMX': 4,    # Number of channels*4 polarisations
-    'BLOCKDIMY': 25,     # Number of baselines
-    'BLOCKDIMZ': 10,     # Number of timesteps
+    'BLOCKDIMY': 8,     # Number of baselines
+    'BLOCKDIMZ': 32,     # Number of timesteps
     'maxregs': 48       # Maximum number of registers
 }
 
 DOUBLE_PARAMS = {
     'BLOCKDIMX': 4,    # Number of channels*4 polarisations
-    'BLOCKDIMY': 25,     # Number of baselines
-    'BLOCKDIMZ': 10,     # Number of timesteps
+    'BLOCKDIMY': 8,     # Number of baselines
+    'BLOCKDIMZ': 32,     # Number of timesteps
     'maxregs': 63       # Maximum number of registers
 }
 
@@ -50,6 +50,7 @@
 #include <stdint.h>
 #include <cstdio>
 #include \"math_constants.h\"
+//#include <cub/block/block_reduce.cuh>
 #include <montblanc/include/abstraction.cuh>
 #include <montblanc/include/jones.cuh>
 
@@ -143,6 +144,12 @@ class SumCohTraits<double> {
     if(TIME >= DEXT(ntime) || BL >= DEXT(nbl) || POLCHAN >= DEXT(npolchan))
         return;
 
+    // --- Specialize BlockReduce for type float. 
+    typedef cub::BlockReduce<T, BLOCKDIMX,cub::BLOCK_REDUCE_WARP_REDUCTIONS,BLOCKDIMY,BLOCKDIMZ> BlockReduceT; 
+
+    // --- Allocate temporary storage in shared memory 
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
     __shared__ struct {
         typename SumCohTraits<T>::UVWType uvw[BLOCKDIMZ][BLOCKDIMY];
 
@@ -223,10 +230,7 @@ class SumCohTraits<double> {
             i += DEXT(nssrc);                    shared.e2 = sersic_shape[i];
             i += DEXT(nssrc);                    shared.sersic_scale = sersic_shape[i];
         }
-        if(threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
-            shared.X2_grad_part[threadIdx.x] = 0.;
-
-        __syncthreads();
+         __syncthreads();
 
         // Create references to a
         // complicated part of shared memory
@@ -296,31 +300,20 @@ class SumCohTraits<double> {
         {
           dev_vis[p].x *= delta.x;
           dev_vis[p].y *= delta.y;
+            
+          dev_vis[p].x += dev_vis[p].y;
+          dev_vis[p].x *= 6;
 
-          typename Tr::ct other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 2);
-          // Add polarisations 2 and 3 to 0 and 1
-          if((POLCHAN & 0x3) < 2)
-          {
-            dev_vis[p].x += other.x;
-            dev_vis[p].y += other.y;
-          }
-          other = cub::ShuffleIndex(dev_vis[p], cub::LaneId() + 1);
-
-          // If this is the polarisation 0, add polarisation 1
-          // and write out this chi squared grad term
-          if((POLCHAN & 0x3) == 0) 
-          {
-            dev_vis[p].x += other.x;
-            dev_vis[p].y += other.y;
-
-            //atomic addition to avoid concurrent access in the shared memory
-            dev_vis[p].x += dev_vis[p].y;
-            dev_vis[p].x *= 6;
-            atomicAdd(&(shared.X2_grad_part[p]), dev_vis[p].x);
-          }
+          __syncthreads();
+
+          // block reduction, result available only in the first thread
+          T aggregate = BlockReduceT(temp_storage).Sum(dev_vis[p].x);
+          
+          if(threadIdx.z == 0 and threadIdx.y == 0 && threadIdx.x == 0)
+                   shared.X2_grad_part[p] = aggregate;
         }
-        __syncthreads();
         
+        __syncthreads();
         //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
         // 3 different threads writes each a different component to avoid serialisation
         if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)

From 6e30f12d346951ceed462622acdca17023381847 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Mon, 19 Sep 2016 16:33:09 +0100
Subject: [PATCH 29/33] fixed block_reduce bug in case of block dimensions not
 factors of nbaselines and ntimes

---
 .../rime/v4/gpu/SersicChiSquaredGradient.py   | 150 +++++++++---------
 1 file changed, 79 insertions(+), 71 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 02e7fe463..1a8a2ab7d 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -34,7 +34,7 @@
 
 FLOAT_PARAMS = {
     'BLOCKDIMX': 4,    # Number of channels*4 polarisations
-    'BLOCKDIMY': 8,     # Number of baselines
+    'BLOCKDIMY': 8,      # Number of baselines
     'BLOCKDIMZ': 32,     # Number of timesteps
     'maxregs': 48       # Maximum number of registers
 }
@@ -86,7 +86,7 @@ class SumCohTraits<double> {
 #define LEXT(name) C.name.lower_extent
 #define UEXT(name) C.name.upper_extent
 #define DEXT(name) (C.name.upper_extent - C.name.lower_extent)
-
+ 
 #define NA (C.na.local_size)
 #define NBL (C.nbl.local_size)
 #define NCHAN (C.nchan.local_size)
@@ -140,10 +140,7 @@ class SumCohTraits<double> {
     int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
     int BL = blockIdx.y*blockDim.y + threadIdx.y;
     int TIME = blockIdx.z*blockDim.z + threadIdx.z;
-
-    if(TIME >= DEXT(ntime) || BL >= DEXT(nbl) || POLCHAN >= DEXT(npolchan))
-        return;
-
+ 
     // --- Specialize BlockReduce for type float. 
     typedef cub::BlockReduce<T, BLOCKDIMX,cub::BLOCK_REDUCE_WARP_REDUCTIONS,BLOCKDIMY,BLOCKDIMZ> BlockReduceT; 
 
@@ -163,22 +160,28 @@ class SumCohTraits<double> {
 
     } shared;
 
+    int i, ANT1, ANT2;
     typename Tr::ct dev_vis[3];
+    dev_vis[0].x = 0.; dev_vis[1].x = 0; dev_vis[2].x = 0;
+    dev_vis[0].y = 0.; dev_vis[1].y = 0; dev_vis[2].y = 0;
+    typename Tr::ct delta;
+    delta.x = 0.; delta.y = 0.;
 
-    T & U = shared.uvw[threadIdx.z][threadIdx.y].x; 
-    T & V = shared.uvw[threadIdx.z][threadIdx.y].y; 
-    T & W = shared.uvw[threadIdx.z][threadIdx.y].z; 
-
-    int i;
-
-    // Figure out the antenna pairs
-    i = TIME*NBL + BL;   
-    int ANT1 = antenna1[i];
-    int ANT2 = antenna2[i];
-
-    // UVW coordinates vary by baseline and time, but not polarised channel
-    if(threadIdx.x == 0)
+    if (TIME < DEXT(ntime) && BL < DEXT(nbl) && POLCHAN < DEXT(npolchan))
     {
+
+      T & U = shared.uvw[threadIdx.z][threadIdx.y].x; 
+      T & V = shared.uvw[threadIdx.z][threadIdx.y].y; 
+      T & W = shared.uvw[threadIdx.z][threadIdx.y].z; 
+    
+      // Figure out the antenna pairs
+      i = TIME*NBL + BL;   
+      ANT1 = antenna1[i];
+      ANT2 = antenna2[i];
+  
+      // UVW coordinates vary by baseline and time, but not polarised channel
+      if(threadIdx.x == 0)
+      {
         // UVW, calculated from u_pq = u_p - u_q
         i = TIME*NA + ANT1;
         shared.uvw[threadIdx.z][threadIdx.y] = uvw[i];
@@ -188,49 +191,53 @@ class SumCohTraits<double> {
         U -= ant2_uvw.x;
         V -= ant2_uvw.y;
         W -= ant2_uvw.z;
-    }
+      }
 
-    // Wavelength varies by channel, but not baseline and time
-    // TODO uses 4 times the actually required space, since
-    // we don't need to store a frequency per polarisation
-    if(threadIdx.y == 0 && threadIdx.z == 0)
+      // Wavelength varies by channel, but not baseline and time
+      // TODO uses 4 times the actually required space, since
+      // we don't need to store a frequency per polarisation
+      if(threadIdx.y == 0 && threadIdx.z == 0)
         { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; }
 
-    i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN;
-    typename Tr::ct delta = observed_vis[i];
-    delta.x -= visibilities[i].x;
-    delta.y -= visibilities[i].y;
+      i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN;
+      delta = observed_vis[i];
+      delta.x -= visibilities[i].x;
+      delta.y -= visibilities[i].y;
 
-    // Zero the polarisation if it is flagged
-    if(flag[i] > 0)
-    {
+      // Zero the polarisation if it is flagged
+      if(flag[i] > 0)
+      {
         delta.x = 0; delta.y = 0;
-    }
+      }
 
-    // Apply any necessary weighting factors
-    if(apply_weights)
-    {
+      // Apply any necessary weighting factors
+      if(apply_weights)
+      {
         T w = weight_vector[i];
         delta.x *= w;
         delta.y *= w;
+      }
     }
-
     int SRC_START = DEXT(npsrc) + DEXT(ngsrc);
     int SRC_STOP = SRC_START + DEXT(nssrc);
 
     // Loop over Sersic Sources
     for(int SRC = SRC_START; SRC < SRC_STOP; ++SRC)
     {
-        // sersic shape only varies by source. Shape parameters
-        // thus apply to the entire block and we can load them with
-        // only the first thread.
-        if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
-        {
-            i = SRC - SRC_START;                 shared.e1 = sersic_shape[i];
-            i += DEXT(nssrc);                    shared.e2 = sersic_shape[i];
-            i += DEXT(nssrc);                    shared.sersic_scale = sersic_shape[i];
-        }
-         __syncthreads();
+
+      // sersic shape only varies by source. Shape parameters
+      // thus apply to the entire block and we can load them with
+      // only the first thread.
+      if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+      {
+          i = SRC - SRC_START;                 shared.e1 = sersic_shape[i];
+          i += DEXT(nssrc);                    shared.e2 = sersic_shape[i];
+          i += DEXT(nssrc);                    shared.sersic_scale = sersic_shape[i];
+      }
+      __syncthreads();
+
+      if (TIME < DEXT(ntime) && BL < DEXT(nbl) && POLCHAN < DEXT(npolchan))
+      {
 
         // Create references to a
         // complicated part of shared memory
@@ -279,7 +286,7 @@ class SumCohTraits<double> {
         dev_vis[1].y = ant_one.y*dev_e2;
         dev_vis[2].x = ant_one.x*dev_scale;
         dev_vis[2].y = ant_one.y*dev_scale;
-   /*
+   
         for (int p = 0; p < 3; p++)
         {
           // Multiply the visibility derivative by antenna 1's g term
@@ -293,35 +300,36 @@ class SumCohTraits<double> {
           montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant1_g_term, ant2_g_term);    
           dev_vis[p] = ant1_g_term;
         }
-   
-    */
-        // Write partial derivative with respect to sersic parameters
-        for (int p=0; p<3; p++)
-        {
-          dev_vis[p].x *= delta.x;
-          dev_vis[p].y *= delta.y;
+ 
+      }   
+      // Write partial derivative with respect to sersic parameters
+      for (int p=0; p<3; p++)
+      {
+        dev_vis[p].x *= delta.x;
+        dev_vis[p].y *= delta.y;
             
-          dev_vis[p].x += dev_vis[p].y;
-          dev_vis[p].x *= 6;
-
-          __syncthreads();
+        dev_vis[p].x += dev_vis[p].y;
+        dev_vis[p].x *= 6;
+         
+        __syncthreads();
 
-          // block reduction, result available only in the first thread
-          T aggregate = BlockReduceT(temp_storage).Sum(dev_vis[p].x);
+         // block reduction, result available only in the first thread
+         T aggregate = BlockReduceT(temp_storage).Sum(dev_vis[p].x);
           
-          if(threadIdx.z == 0 and threadIdx.y == 0 && threadIdx.x == 0)
-                   shared.X2_grad_part[p] = aggregate;
-        }
+         if(threadIdx.z == 0 and threadIdx.y == 0 && threadIdx.x == 0)
+                 shared.X2_grad_part[p] = aggregate;
+      }
         
-        __syncthreads();
-        //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
-        // 3 different threads writes each a different component to avoid serialisation
-        if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
-        {
-            i = SRC - SRC_START + threadIdx.x*DEXT(nssrc);
-            atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]);
-        }
+      __syncthreads();
+      //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
+      // 3 different threads writes each a different component to avoid serialisation
+      if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
+      {
+          i = SRC - SRC_START + threadIdx.x*DEXT(nssrc);
+          atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]);
+      }
     }
+  
 }
 
 extern "C" {

From 2795b5ff24788c0a3c59da97b171065801875c9e Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Wed, 10 May 2017 17:45:47 +0100
Subject: [PATCH 30/33] Enabled single polarization for GPU version

---
 montblanc/factory.py                          |   3 +-
 montblanc/impl/common/loaders/loaders.py      |  12 +-
 montblanc/impl/rime/v4/config.py              |  29 +-
 montblanc/impl/rime/v4/gpu/RimeBSqrt.py       |  86 +++++-
 montblanc/impl/rime/v4/gpu/RimeEBeam.py       | 287 +++++++++++++++++-
 montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py     |  13 +-
 .../impl/rime/v4/gpu/RimeSumCoherencies.py    |  73 +++--
 .../rime/v4/gpu/SersicChiSquaredGradient.py   |  18 +-
 montblanc/impl/rime/v4/loaders/loaders.py     |   2 +-
 montblanc/impl/rime/v5/CompositeRimeSolver.py |  11 +-
 montblanc/impl/rime/v5/loaders/loaders.py     |   2 +-
 11 files changed, 472 insertions(+), 64 deletions(-)

diff --git a/montblanc/factory.py b/montblanc/factory.py
index f9d3f4ad9..eae4c4b54 100644
--- a/montblanc/factory.py
+++ b/montblanc/factory.py
@@ -137,12 +137,13 @@ def create_rime_solver_from_ms(slvr_class_type, slvr_cfg):
     autocor = slvr_cfg.get(Options.AUTO_CORRELATIONS)
 
     with MeasurementSetLoader(msfile, auto_correlations=autocor) as loader:
-        ntime, nbl, na, nbands, nchan = loader.get_dims()
+        ntime, nbl, na, nbands, nchan, npol = loader.get_dims()
         slvr_cfg[Options.NTIME] = ntime
         slvr_cfg[Options.NA] = na
         slvr_cfg[Options.NBL] = nbl
         slvr_cfg[Options.NBANDS] = nbands
         slvr_cfg[Options.NCHAN] = nchan
+        slvr_cfg[Options.NPOL] = npol
         slvr = slvr_class_type(slvr_cfg)
         loader.load(slvr, slvr_cfg)
         return slvr
diff --git a/montblanc/impl/common/loaders/loaders.py b/montblanc/impl/common/loaders/loaders.py
index b6f31d9cd..3ac524989 100644
--- a/montblanc/impl/common/loaders/loaders.py
+++ b/montblanc/impl/common/loaders/loaders.py
@@ -31,6 +31,7 @@
 ANTENNA_TABLE = 'ANTENNA'
 SPECTRAL_WINDOW = 'SPECTRAL_WINDOW'
 FIELD_TABLE = 'FIELD'
+POL_TABLE = 'POLARIZATION'
 
 class MeasurementSetLoader(BaseLoader):
     LOG_PREFIX = 'LOADER:'
@@ -43,6 +44,7 @@ def __init__(self, msfile, auto_correlations=False):
         self.antfile = '::'.join((self.msfile, ANTENNA_TABLE))
         self.freqfile = '::'.join((self.msfile, SPECTRAL_WINDOW))
         self.fieldfile = '::'.join((self.msfile, FIELD_TABLE))
+        self.polfile = '::'.join((self.msfile, POL_TABLE))
 
         montblanc.log.info("{lp} Opening Measurement Set {ms}.".format(
             lp=self.LOG_PREFIX, ms=self.msfile))
@@ -58,6 +60,7 @@ def __init__(self, msfile, auto_correlations=False):
         # (2) baseline (ANTENNA1, ANTENNA2)
         # (3) band (SPECTRAL_WINDOW_ID via DATA_DESC_ID)
         ordering_query = ' '.join(["SELECT FROM $ms",
+            #"WHERE ANTENNA1 != ANTENNA2",
             "WHERE FIELD_ID={fid}".format(fid=field_id),
             "" if auto_correlations else "AND ANTENNA1 != ANTENNA2",
             "ORDERBY TIME, ANTENNA1, ANTENNA2, "
@@ -70,6 +73,7 @@ def __init__(self, msfile, auto_correlations=False):
         self.tables['ant']  = at = pt.table(self.antfile, ack=False, readonly=True)
         self.tables['freq'] = ft = pt.table(self.freqfile, ack=False, readonly=True)
         self.tables['field'] = fit = pt.table(self.fieldfile, ack=False, readonly=True)
+        self.tables['pol'] = polt = pt.table(self.polfile, ack=False, readonly=True)
 
         self.nrows = ordered_ms.nrows()
         self.na = at.nrows()
@@ -80,6 +84,9 @@ def __init__(self, msfile, auto_correlations=False):
         bl_query = "SELECT FROM $ordered_ms ORDERBY UNIQUE ANTENNA1, ANTENNA2"
         self.nbl = pt.taql(bl_query).nrows()
 
+        # Number of polarizations (assuming to be the same for all spectral windows)
+        self.npol = polt.getcol('NUM_CORR')[0]
+
         # Number of channels per band
         chan_per_band = ft.getcol('NUM_CHAN')
 
@@ -105,6 +112,8 @@ def __init__(self, msfile, auto_correlations=False):
             aeq=autocor_eq, ebl=expected_nbl, astr=autocor_str, nbl=self.nbl))
         self.log("Found {nb} band(s), containing {cpb} channels.".format(
             nb=self.nbands, nc=chan_per_band[0], cpb=chan_per_band))
+        self.log("Found {npol} polarization(s) in the POLARIZATION table.".format(
+            npol=self.npol))
 
         # Sanity check computed rows vs actual rows
         computed_rows = self.ntime*self.nbl*self.nbands
@@ -116,13 +125,14 @@ def __init__(self, msfile, auto_correlations=False):
                         nbl=self.nbl, nb=self.nbands,
                         cr=computed_rows, msr=self.nrows))
 
+
     def get_dims(self):
         """
         Returns a tuple with the number of
         timesteps, baselines, antenna, channel bands and channels
         """
         # Determine the problem dimensions
-        return self.ntime, self.nbl, self.na, self.nbands, self.nchan
+        return self.ntime, self.nbl, self.na, self.nbands, self.nchan, self.npol
 
     def log(self, msg, *args, **kwargs):
         montblanc.log.info('{lp} {m}'.format(lp=self.LOG_PREFIX, m=msg),
diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py
index d88bfb2d3..bf801d3ed 100644
--- a/montblanc/impl/rime/v4/config.py
+++ b/montblanc/impl/rime/v4/config.py
@@ -127,6 +127,8 @@ class Classifier(Enum):
     EKB_SQRT_INPUT = 'ekb_sqrt_input'
     COHERENCIES_INPUT = 'coherencies_input'
 
+
+
 # List of arrays
 A = [
     # UVW coordinates
@@ -177,15 +179,17 @@ class Classifier(Enum):
         default=1,
         test=lambda slvr, ary: rary(ary)),
 
-    ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 4), 'ct',
+    ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 'npol'), 'ct',
         classifiers=frozenset([Classifier.E_BEAM_INPUT]),
-        default=np.array([1,0,0,1])[np.newaxis,np.newaxis,np.newaxis,:],
+        default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
+            else np.array([1])[np.newaxis,np.newaxis,:]),
         test=lambda slvr, ary: rary(ary)),
 
     # Direction-Independent Effects
-    ary_dict('G_term', ('ntime', 'na', 'nchan', 4), 'ct',
+    ary_dict('G_term', ('ntime', 'na', 'nchan', 'npol'), 'ct',
         classifiers=frozenset([Classifier.COHERENCIES_INPUT]),
-        default=np.array([1,0,0,1])[np.newaxis,np.newaxis,np.newaxis,:],
+        default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
+            else np.array([1])[np.newaxis,np.newaxis,:]),
         test=lambda slvr, ary: rary(ary)),
 
     # Source Definitions
@@ -195,9 +199,10 @@ class Classifier(Enum):
         default=0,
         test=lambda slvr, ary: (rary(ary)-0.5) * 1e-1),
 
-    ary_dict('stokes', ('nsrc','ntime', 4), 'ft',
+    ary_dict('stokes', ('nsrc','ntime', 'npol'), 'ft',
         classifiers=frozenset([Classifier.B_SQRT_INPUT]),
-        default=np.array([1,0,0,0])[np.newaxis,np.newaxis,:],
+        default= lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[2] == 4 
+            else np.array([1])[np.newaxis,np.newaxis,:]),
         test=rand_stokes),
 
     ary_dict('alpha', ('nsrc','ntime'), 'ft',
@@ -216,7 +221,7 @@ class Classifier(Enum):
         test=rand_sersic_shape),
 
     # Visibility flagging arrays
-    ary_dict('flag', ('ntime', 'nbl', 'nchan', 4), np.uint8,
+    ary_dict('flag', ('ntime', 'nbl', 'nchan', 'npol'), np.uint8,
         classifiers=frozenset([Classifier.X2_INPUT,
             Classifier.COHERENCIES_INPUT]),
         default=0,
@@ -224,23 +229,23 @@ class Classifier(Enum):
             0, 1, size=ary.shape)),
 
     # Bayesian Data
-    ary_dict('weight_vector', ('ntime','nbl','nchan',4), 'ft',
+    ary_dict('weight_vector', ('ntime','nbl','nchan','npol'), 'ft',
         classifiers=frozenset([Classifier.X2_INPUT,
             Classifier.COHERENCIES_INPUT]),
         default=1,
         test=lambda slvr, ary: rary(ary)),
-    ary_dict('observed_vis', ('ntime','nbl','nchan',4), 'ct',
+    ary_dict('observed_vis', ('ntime','nbl','nchan','npol'), 'ct',
         classifiers=frozenset([Classifier.X2_INPUT,
             Classifier.COHERENCIES_INPUT]),
         default=0,
         test=lambda slvr, ary: rary(ary)),
 
     # Result arrays
-    ary_dict('B_sqrt', ('nsrc', 'ntime', 'nchan', 4), 'ct',
+    ary_dict('B_sqrt', ('nsrc', 'ntime', 'nchan', 'npol'), 'ct',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
-    ary_dict('jones', ('nsrc','ntime','na','nchan',4), 'ct',
+    ary_dict('jones', ('nsrc','ntime','na','nchan','npol'), 'ct',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
-    ary_dict('model_vis', ('ntime','nbl','nchan',4), 'ct',
+    ary_dict('model_vis', ('ntime','nbl','nchan','npol'), 'ct',
         classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])),
     ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
diff --git a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
index 784a2152a..1e56313fe 100644
--- a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
+++ b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
@@ -127,6 +127,59 @@
     B_sqrt[i] = B_square_root;
 }
 
+
+
+// Case NPOL = 1
+
+template <
+    typename T,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> >
+__device__
+void rime_jones1_B_sqrt_impl(
+    T * stokes,
+    T * alpha,
+    T * frequency,
+    T * ref_frequency,
+    typename Tr::ct * B_sqrt)
+{
+    int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
+    int TIME = blockIdx.y*blockDim.y + threadIdx.y;
+    int SRC = blockIdx.z*blockDim.z + threadIdx.z;
+
+    if(SRC >= DEXT(nsrc) || TIME >= DEXT(ntime) || POLCHAN >= DEXT(nchan))
+        { return; }
+
+    __shared__ T freq_ratio[BLOCKDIMX];
+
+    // TODO. Using 3 times more shared memory than we
+    // really require here, since there's only
+    // one frequency per channel.
+    if(threadIdx.y == 0 && threadIdx.z == 0)
+    {
+        freq_ratio[threadIdx.x] = frequency[POLCHAN] / 
+            ref_frequency[POLCHAN];
+    }
+
+    __syncthreads();
+
+    // Calculate the power term
+    int i = SRC*NTIME + TIME;
+    typename Tr::ft power = Po::pow(freq_ratio[threadIdx.x], alpha[i]);
+
+    // Read in the stokes parameter,
+    // multiplying it by the power term
+    typename Tr::ft pol = stokes[i]*power;
+
+    // Write out the square root of the brightness
+    i = (SRC*NTIME + TIME)*NPOLCHAN + POLCHAN;
+    B_sqrt[i].x = Po::sqrt(pol);
+    B_sqrt[i].y = 0.;
+}
+
+
+
+
 extern "C" {
 
 #define stamp_jones_B_sqrt_fn(ft,ct) \
@@ -142,9 +195,27 @@
         frequency, ref_frequency, B_sqrt); \
 }
 
+#define stamp_jones1_B_sqrt_fn(ft,ct) \
+__global__ void \
+rime_jones1_B_sqrt_ ## ft( \
+    ft * stokes, \
+    ft * alpha, \
+    ft * frequency, \
+    ft * ref_frequency, \
+    ct * B_sqrt) \
+{ \
+    rime_jones1_B_sqrt_impl<ft>(stokes, alpha, \
+        frequency, ref_frequency, B_sqrt); \
+}
+
+
 stamp_jones_B_sqrt_fn(float,float2);
 stamp_jones_B_sqrt_fn(double,double2);
 
+stamp_jones1_B_sqrt_fn(float,float2);
+stamp_jones1_B_sqrt_fn(double,double2);
+
+
 } // extern "C" {
 """)
 
@@ -154,8 +225,8 @@ def __init__(self):
 
     def initialise(self, solver, stream=None):
         slvr = solver
-        nsrc, ntime, npolchan = slvr.dim_local_size(
-            'nsrc', 'ntime', 'npolchan')
+        nsrc, ntime, npolchan, npol = slvr.dim_local_size(
+            'nsrc', 'ntime', 'npolchan', 'npol')
 
         # Get a property dictionary off the solver
         D = slvr.template_dict()
@@ -171,9 +242,14 @@ def initialise(self, solver, stream=None):
         regs = str(FLOAT_PARAMS['maxregs'] \
                 if slvr.is_float() else DOUBLE_PARAMS['maxregs'])
 
-        kname = 'rime_jones_B_sqrt_float' \
-            if slvr.is_float() is True else \
-            'rime_jones_B_sqrt_double'
+        if npol == 1:
+            kname = 'rime_jones1_B_sqrt_float' \
+                if slvr.is_float() is True else \
+                'rime_jones1_B_sqrt_double'
+        else:
+            kname = 'rime_jones_B_sqrt_float' \
+                if slvr.is_float() is True else \
+                'rime_jones_B_sqrt_double'
 
         kernel_string = KERNEL_TEMPLATE.substitute(**D)
 
diff --git a/montblanc/impl/rime/v4/gpu/RimeEBeam.py b/montblanc/impl/rime/v4/gpu/RimeEBeam.py
index c174f2640..1d9215406 100644
--- a/montblanc/impl/rime/v4/gpu/RimeEBeam.py
+++ b/montblanc/impl/rime/v4/gpu/RimeEBeam.py
@@ -83,18 +83,19 @@
 #define BEAM_MH LOCAL(beam_mh)
 #define BEAM_NUD LOCAL(beam_nud)
 
+// Infer channel from x thread
+ __device__ __forceinline__ int thread_chan()
+    { return threadIdx.x >> 2; }
+
 // Infer polarisation from x thread
 __device__ __forceinline__ int ebeam_pol()
     { return threadIdx.x & 0x3; }
 
-// Infer channel from x thread
-__device__ __forceinline__ int thread_chan()
-    { return threadIdx.x >> 2; }
 
 template <
     typename T,
     typename Tr=montblanc::kernel_traits<T>,
-    typename Po=montblanc::kernel_policies<T> >
+    typename Po=montblanc::kernel_policies<T> > 
 __device__ __forceinline__
 void trilinear_interpolate(
     typename Tr::ct & pol_sum,
@@ -105,8 +106,31 @@
     const T & gchan,
     const T & weight)
 {
-    int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan))*NPOL
-        + ebeam_pol();
+    int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan))*NPOL +ebeam_pol();
+
+    // Perhaps unnecessary as long as BLOCKDIMX is 32
+    typename Tr::ct pol = cub::ThreadLoad<cub::LOAD_LDG>(E_beam + i);
+    pol_sum.x += weight*pol.x;
+    pol_sum.y += weight*pol.y;
+    abs_sum += weight*Po::abs(pol);
+}
+
+
+template <
+    typename T,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> > 
+__device__ __forceinline__
+void trilinear_interpolate_J1(
+    typename Tr::ct & pol_sum,
+    typename Tr::ft & abs_sum,
+    typename Tr::ct * E_beam,
+    const T & gl,
+    const T & gm,
+    const T & gchan,
+    const T & weight)
+{
+    int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan));
 
     // Perhaps unnecessary as long as BLOCKDIMX is 32
     typename Tr::ct pol = cub::ThreadLoad<cub::LOAD_LDG>(E_beam + i);
@@ -115,6 +139,7 @@
     abs_sum += weight*Po::abs(pol);
 }
 
+
 template <typename T> class EBeamTraits {};
 
 template <> class EBeamTraits<float>
@@ -343,6 +368,216 @@
     }
 }
 
+
+
+template <
+    typename T,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> >
+__device__
+void rime_jones1_E_beam_impl(
+    typename EBeamTraits<T>::LMType * lm,
+    typename EBeamTraits<T>::ParallacticAngleType * parallactic_angles,
+    typename EBeamTraits<T>::PointErrorType * point_errors,
+    typename EBeamTraits<T>::AntennaScaleType * antenna_scaling,
+    typename Tr::ft * frequency,
+    typename Tr::ct * E_beam,
+    typename Tr::ct * jones,
+    T beam_ll, T beam_lm, T beam_lfreq,
+    T beam_ul, T beam_um, T beam_ufreq)
+{
+    int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
+    int ANT = blockIdx.y*blockDim.y + threadIdx.y;
+    int TIME = blockIdx.z*blockDim.z + threadIdx.z;
+
+    typedef typename EBeamTraits<T>::LMType LMType;
+    typedef typename EBeamTraits<T>::PointErrorType PointErrorType;
+    typedef typename EBeamTraits<T>::AntennaScaleType AntennaScaleType;
+
+    if(TIME >= DEXT(ntime) || ANT >= DEXT(na) || POLCHAN >= DEXT(npolchan))
+        return;
+
+    __shared__ struct {
+        T lscale;             // l axis scaling factor
+        T mscale;             // m axis scaling factor
+        T pa_sin[BLOCKDIMZ][BLOCKDIMY];  // sin of parallactic angle
+        T pa_cos[BLOCKDIMZ][BLOCKDIMY];  // cos of parallactic angle
+        T gchan0[BLOCKDIMX];  // channel grid position (snapped)
+        T gchan1[BLOCKDIMX];  // channel grid position (snapped)
+        T chd[BLOCKDIMX];    // difference between gchan0 and actual grid position
+        PointErrorType pe[BLOCKDIMZ][BLOCKDIMY][BLOCKDIMX];  // pointing errors
+        AntennaScaleType as[BLOCKDIMY][BLOCKDIMX];           // antenna scaling
+    } shared;
+
+
+    int i;
+
+    // Precompute l and m scaling factors in shared memory
+    if(threadIdx.z == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+    {
+        shared.lscale = T(BEAM_LW - 1) / (beam_ul - beam_ll);
+        shared.mscale = T(BEAM_MH - 1) / (beam_um - beam_lm);
+    }
+
+    // Pointing errors vary by time, antenna and channel,
+    i = (TIME*NA + ANT)*NCHAN + POLCHAN;
+    shared.pe[threadIdx.z][threadIdx.y][threadIdx.x] = point_errors[i];
+
+    // Antenna scaling factors vary by antenna and channel,
+    // but not timestep
+    if(threadIdx.z == 0)
+    {
+        i = ANT*NCHAN + POLCHAN;
+        shared.as[threadIdx.y][threadIdx.x] = antenna_scaling[i];
+    }
+
+    // Frequency vary by channel, but not timestep or antenna
+    if(threadIdx.z == 0 && threadIdx.y == 0)
+    {
+        // Channel coordinate
+        // channel grid position
+        T freqscale = T(BEAM_NUD-1) / (beam_ufreq - beam_lfreq);
+        T chan = freqscale * (frequency[POLCHAN] - beam_lfreq);
+        // clamp to grid edges
+        chan = Po::clamp(chan, 0.0, BEAM_NUD-1);
+        // Snap to grid coordinate
+        shared.gchan0[threadIdx.x] = Po::floor(chan);
+        shared.gchan1[threadIdx.x] = Po::min(
+            shared.gchan0[threadIdx.x] + 1.0, BEAM_NUD-1);
+        // Offset of snapped coordinate from grid position
+        shared.chd[threadIdx.x] = chan - shared.gchan0[threadIdx.x];
+    }
+
+    // Parallactic angles vary by time and antenna, but not channel
+    if(threadIdx.x == 0)
+    {
+        i = TIME*NA + ANT;
+        T parangle = parallactic_angles[i];
+        Po::sincos(parangle,
+            &shared.pa_sin[threadIdx.z][threadIdx.y],
+            &shared.pa_cos[threadIdx.z][threadIdx.y]);
+    }
+
+    __syncthreads();
+
+    // Loop over sources
+    for(int SRC=0; SRC < DEXT(nsrc); ++SRC)
+    {
+        // lm coordinate for this source
+        LMType rlm = lm[SRC];
+
+        // L coordinate
+        // Rotate
+        T l = rlm.x*shared.pa_cos[threadIdx.z][threadIdx.y] -
+            rlm.y*shared.pa_sin[threadIdx.z][threadIdx.y];
+        // Add the pointing errors for this antenna.
+        l += shared.pe[threadIdx.z][threadIdx.y][threadIdx.x].x;
+        // Scale by antenna scaling factors
+        l *= shared.as[threadIdx.y][threadIdx.x].x;
+        // l grid position
+        l = shared.lscale * (l - beam_ll);
+        // clamp to grid edges
+        l = Po::clamp(0.0, l, BEAM_LW-1);
+        // Snap to grid coordinate
+        T gl0 = Po::floor(l);
+        T gl1 = Po::min(gl0 + 1.0, BEAM_LW-1);
+        // Offset of snapped coordinate from grid position
+        T ld = l - gl0;
+
+        // M coordinate
+        // rotate
+        T m = rlm.x*shared.pa_sin[threadIdx.z][threadIdx.y] +
+            rlm.y*shared.pa_cos[threadIdx.z][threadIdx.y];
+        // Add the pointing errors for this antenna.
+        m += shared.pe[threadIdx.z][threadIdx.y][threadIdx.x].y;
+        // Scale by antenna scaling factors
+        m *= shared.as[threadIdx.y][threadIdx.x].y;
+        // m grid position
+        m = shared.mscale * (m - beam_lm);
+        // clamp to grid edges
+        m = Po::clamp(0.0, m, BEAM_MH-1);
+        // Snap to grid position
+        T gm0 = Po::floor(m);
+        T gm1 = Po::min(gm0 + 1.0, BEAM_MH-1);
+        // Offset of snapped coordinate from grid position
+        T md = m - gm0;
+
+        typename Tr::ct pol_sum = Po::make_ct(0.0, 0.0);
+        typename Tr::ft abs_sum = T(0.0);
+
+        // A simplified trilinear weighting is used here. Given
+        // point x between points x1 and x2, with function f
+        // provided values f(x1) and f(x2) at these points.
+        //
+        // x1 ------- x ---------- x2
+        //
+        // Then, the value of f can be approximated using the following:
+        // f(x) ~= f(x1)(x2-x)/(x2-x1) + f(x2)(x-x1)/(x2-x1)
+        //
+        // Note how the value f(x1) is weighted with the distance
+        // from the opposite point (x2-x).
+        //
+        // As we are interpolating on a grid, we have the following
+        // 1. (x2 - x1) == 1
+        // 2. (x - x1)  == 1 - 1 + (x - x1)
+        //              == 1 - (x2 - x1) + (x - x1)
+        //              == 1 - (x2 - x)
+        // 2. (x2 - x)  == 1 - 1 + (x2 - x)
+        //              == 1 - (x2 - x1) + (x2 - x)
+        //              == 1 - (x - x1)
+        //
+        // Extending the above to 3D, we have
+        // f(x,y,z) ~= f(x1,y1,z1)(x2-x)(y2-y)(z2-z) + ...
+        //           + f(x2,y2,z2)(x-x1)(y-y1)(z-z1)
+        //
+        // f(x,y,z) ~= f(x1,y1,z1)(1-(x-x1))(1-(y-y1))(1-(z-z1)) + ...
+        //           + f(x2,y2,z2)   (x-x1)    (y-y1)    (z-z1)
+
+        // Load in the complex values from the E beam
+        // at the supplied coordinate offsets.
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm0, shared.gchan0[threadIdx.x],
+            (1.0f-ld)*(1.0f-md)*(1.0f-shared.chd[threadIdx.x]));
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm0, shared.gchan0[threadIdx.x],
+            ld*(1.0f-md)*(1.0f-shared.chd[threadIdx.x]));
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm1, shared.gchan0[threadIdx.x],
+            (1.0f-ld)*md*(1.0f-shared.chd[threadIdx.x]));
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm1, shared.gchan0[threadIdx.x],
+            ld*md*(1.0f-shared.chd[threadIdx.x]));
+
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm0, shared.gchan1[threadIdx.x],
+            (1.0f-ld)*(1.0f-md)*shared.chd[threadIdx.x]);
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm0, shared.gchan1[threadIdx.x],
+            ld*(1.0f-md)*shared.chd[threadIdx.x]);
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm1, shared.gchan1[threadIdx.x],
+            (1.0f-ld)*md*shared.chd[threadIdx.x]);
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm1, shared.gchan1[threadIdx.x],
+            ld*md*shared.chd[threadIdx.x]);
+
+        // Normalise the angle and multiply in the absolute sum
+        typename Tr::ft norm = Po::rsqrt(pol_sum.x*pol_sum.x + pol_sum.y*pol_sum.y);
+        if(!::isfinite(norm))
+            { norm = 1.0; }
+
+        pol_sum.x *= norm * abs_sum;
+        pol_sum.y *= norm * abs_sum;
+
+        i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN;
+        jones[i] = pol_sum;
+    }
+}
+
+
+
+
+
 extern "C" {
 
 #define stamp_jones_E_beam_fn(ft,ct,lm_type,pa_type,pe_type,as_type) \
@@ -365,6 +600,26 @@
         beam_ul, beam_um, beam_ufreq); \
 }
 
+#define stamp_jones1_E_beam_fn(ft,ct,lm_type,pa_type,pe_type,as_type) \
+__global__ void \
+rime_jones1_E_beam_ ## ft( \
+    lm_type * lm, \
+    pa_type * parallactic_angles, \
+    pe_type * point_errors, \
+    as_type * antenna_scaling, \
+    ft * frequency, \
+    ct * E_beam, \
+    ct * jones, \
+    ft beam_ll, ft beam_lm, ft beam_lfreq, \
+    ft beam_ul, ft beam_um, ft beam_ufreq) \
+{ \
+    rime_jones1_E_beam_impl<ft>( \
+        lm, parallactic_angles, point_errors, \
+        antenna_scaling, frequency, E_beam, jones, \
+        beam_ll, beam_lm, beam_lfreq, \
+        beam_ul, beam_um, beam_ufreq); \
+}
+
 ${stamp_function}
 
 } // extern "C" {
@@ -376,7 +631,7 @@ def __init__(self):
 
     def initialise(self, solver, stream=None):
         slvr = solver
-        ntime, na, npolchan = slvr.dim_local_size('ntime', 'na', 'npolchan')
+        ntime, na, npolchan, npol = slvr.dim_local_size('ntime', 'na', 'npolchan', 'npol')
 
         # Get a property dictionary off the solver
         D = slvr.template_dict()
@@ -400,12 +655,20 @@ def initialise(self, solver, stream=None):
             'float' if slvr.is_float() else 'double',
             'float2' if slvr.is_float() else 'double2',
             'float2' if slvr.is_float() else 'double2'])
-        stamp_fn = ''.join(['stamp_jones_E_beam_fn(', stamp_args, ')'])
-        D['stamp_function'] = stamp_fn
 
-        kname = 'rime_jones_E_beam_float' \
-            if slvr.is_float() is True else \
-            'rime_jones_E_beam_double'
+        if npol == 4:
+            stamp_fn = ''.join(['stamp_jones_E_beam_fn(', stamp_args, ')'])
+            D['stamp_function'] = stamp_fn
+            kname = 'rime_jones_E_beam_float' \
+                if slvr.is_float() is True else \
+                'rime_jones_E_beam_double'
+
+        if npol == 1:
+            stamp_fn = ''.join(['stamp_jones1_E_beam_fn(', stamp_args, ')'])
+            D['stamp_function'] = stamp_fn
+            kname = 'rime_jones1_E_beam_float' \
+                if slvr.is_float() is True else \
+                'rime_jones1_E_beam_double'
 
         kernel_string = KERNEL_TEMPLATE.substitute(**D)
 
diff --git a/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py
index ba802f801..325d0800a 100644
--- a/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py
+++ b/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py
@@ -110,7 +110,6 @@ class EKBTraits<double> {
     int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
     int ANT = blockIdx.y*blockDim.y + threadIdx.y;
     int TIME = blockIdx.z*blockDim.z + threadIdx.z;
-    #define POL (threadIdx.x & 0x3)
 
     if(TIME >= DEXT(ntime) || ANT >= DEXT(na) || POLCHAN >= DEXT(npolchan))
         return;
@@ -134,7 +133,13 @@ class EKBTraits<double> {
 
     // Wavelengths vary by channel, not by time and antenna
     if(threadIdx.y == 0 && threadIdx.z == 0)
+    {
+      if (NPOL > 1)
         { freq[threadIdx.x] = frequency[POLCHAN>>2]; }
+      else
+        { freq[threadIdx.x] = frequency[POLCHAN]; }
+    }
+   
 
     __syncthreads();
 
@@ -168,7 +173,11 @@ class EKBTraits<double> {
         i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN;
         // Load in the E Beam, and multiply it by KB
         typename Tr::ct J = jones[i];
-        montblanc::jones_multiply_4x4_in_place<T>(J, cplx_phase);
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_in_place<T>(J, cplx_phase);
+        else
+          montblanc::complex_multiply_in_place<T>(J, cplx_phase);
+       
 
         // Write out the jones matrices
         i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN;
diff --git a/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py b/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py
index 991f208fc..049554af7 100644
--- a/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py
+++ b/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py
@@ -174,7 +174,12 @@ class SumCohTraits<double> {
     // TODO uses 4 times the actually required space, since
     // we don't need to store a frequency per polarisation
     if(threadIdx.y == 0 && threadIdx.z == 0)
+    {
+      if (NPOL > 1)
         { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; }
+      else
+        { shared.freq[threadIdx.x] = frequency[POLCHAN]; }
+    }
 
     // We process sources in batches, accumulating visibility values.
     // Visibilities are read in and written out at the start and end
@@ -202,7 +207,10 @@ class SumCohTraits<double> {
         typename Tr::ct ant_one = jones[i];
         i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
         typename Tr::ct ant_two = jones[i];
-        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        if (NPOL > 1)
+           montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+           montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
 
         polsum.x += ant_one.x;
         polsum.y += ant_one.y;
@@ -248,7 +256,11 @@ class SumCohTraits<double> {
         ant_one.y *= exp;
         i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
         typename Tr::ct ant_two = jones[i];
-        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+          montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
+       
 
         polsum.x += ant_one.x;
         polsum.y += ant_one.y;
@@ -298,7 +310,10 @@ class SumCohTraits<double> {
         ant_one.y *= sersic_factor;
         i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
         typename Tr::ct ant_two = jones[i];
-        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+          montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
 
         polsum.x += ant_one.x;
         polsum.y += ant_one.y;
@@ -320,12 +335,20 @@ class SumCohTraits<double> {
     // Multiply the visibility by antenna 1's g term
     i = (TIME*NA + ANT1)*NPOLCHAN + POLCHAN;
     typename Tr::ct model_vis = G_term[i];
-    montblanc::jones_multiply_4x4_in_place<T>(model_vis, polsum);
+    if (NPOL > 1)
+        montblanc::jones_multiply_4x4_in_place<T>(model_vis, polsum);
+    else
+        montblanc::complex_multiply_in_place<T>(model_vis, polsum);
+   
 
     // Multiply the visibility by antenna 2's g term
     i = (TIME*NA + ANT2)*NPOLCHAN + POLCHAN;
     typename Tr::ct ant2_g_term = G_term[i];
-    montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(model_vis, ant2_g_term);
+    if (NPOL > 1)
+      montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(model_vis, ant2_g_term);
+    else
+      montblanc::complex_conjugate_multiply_in_place<T>(model_vis, ant2_g_term);
+    
 
     // Compute the chi squared sum terms
     i = (TIME*NBL + BL)*NPOLCHAN + POLCHAN;
@@ -367,28 +390,36 @@ class SumCohTraits<double> {
         obs_vis.y *= w;
     }
 
-    // Now, add the real and imaginary components
-    // of each adjacent group of four polarisations
-    // into the first polarisation.
-    typename Tr::ct other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 2);
-
-    // Add polarisations 2 and 3 to 0 and 1
-    if((POLCHAN & 0x3) < 2)
+    if (NPOL > 1)
     {
-        obs_vis.x += other.x;
-        obs_vis.y += other.y;
-    }
-
-    other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 1);
-
-    // If this is the polarisation 0, add polarisation 1
-    // and write out this chi squared sum term
-    if((POLCHAN & 0x3) == 0) {
+       // Now, add the real and imaginary components
+       // of each adjacent group of four polarisations
+       // into the first polarisation.
+       typename Tr::ct other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 2);
+
+       // Add polarisations 2 and 3 to 0 and 1
+       if((POLCHAN & 0x3) < 2)
+       {
+          obs_vis.x += other.x;
+          obs_vis.y += other.y;
+       }
+
+       other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 1);
+
+       // If this is the polarisation 0, add polarisation 1
+       // and write out this chi squared sum term
+      if((POLCHAN & 0x3) == 0) {
         obs_vis.x += other.x;
         obs_vis.y += other.y;
 
         i = (TIME*NBL + BL)*NCHAN + (POLCHAN >> 2);
         chi_sqrd_result[i] = obs_vis.x + obs_vis.y;
+      }
+    }
+    else
+    {
+      i = (TIME*NBL + BL)*NCHAN + POLCHAN;
+      chi_sqrd_result[i] = obs_vis.x + obs_vis.y;
     }
 }
 
diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 1a8a2ab7d..6138019e7 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -197,7 +197,13 @@ class SumCohTraits<double> {
       // TODO uses 4 times the actually required space, since
       // we don't need to store a frequency per polarisation
       if(threadIdx.y == 0 && threadIdx.z == 0)
-        { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; }
+      {        
+        if (NPOL >1)
+          { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; }
+        else
+          { shared.freq[threadIdx.x] = frequency[POLCHAN]; }
+      }
+
 
       i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN;
       delta = observed_vis[i];
@@ -278,7 +284,11 @@ class SumCohTraits<double> {
         ant_one.y *= sersic_factor;
         i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
         typename Tr::ct ant_two = jones[i];
-        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+          montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
+       
 
         dev_vis[0].x = ant_one.x*dev_e1;
         dev_vis[0].y = ant_one.y*dev_e1;
@@ -286,7 +296,7 @@ class SumCohTraits<double> {
         dev_vis[1].y = ant_one.y*dev_e2;
         dev_vis[2].x = ant_one.x*dev_scale;
         dev_vis[2].y = ant_one.y*dev_scale;
-   
+  /* 
         for (int p = 0; p < 3; p++)
         {
           // Multiply the visibility derivative by antenna 1's g term
@@ -300,7 +310,7 @@ class SumCohTraits<double> {
           montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant1_g_term, ant2_g_term);    
           dev_vis[p] = ant1_g_term;
         }
- 
+   */
       }   
       // Write partial derivative with respect to sersic parameters
       for (int p=0; p<3; p++)
diff --git a/montblanc/impl/rime/v4/loaders/loaders.py b/montblanc/impl/rime/v4/loaders/loaders.py
index eaabc062e..63b1c933f 100644
--- a/montblanc/impl/rime/v4/loaders/loaders.py
+++ b/montblanc/impl/rime/v4/loaders/loaders.py
@@ -250,4 +250,4 @@ def __enter__(solver):
         return super(MeasurementSetLoader,solver).__enter__()
 
     def __exit__(solver, type, value, traceback):
-        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)
\ No newline at end of file
+        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)
diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index 5bbb6abd7..47d9b98ee 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -307,6 +307,9 @@ def _gen_vis_slices(self):
         cpu_slice['nvis'] = slice(0, 0, 1)
         gpu_slice['nvis'] = slice(0, 0, 1)
 
+        cpu_slice['npol'] = slice(0, npol, 1)
+        gpu_slice['npol'] = slice(0, npol, 1)
+
         montblanc.log.debug('Generating RIME slices')
 
         # Set up time slicing
@@ -1172,7 +1175,7 @@ def _sync_wait(future):
         X2_sum = self.ft(0.0)
         if self.enable_sersic_grad:
             nssrc = self.dim_local_size('nssrc')
-            X2_grad = self.ft(np.zeros((3,nssrc)))
+            self.X2_grad = self.ft(np.zeros((3,nssrc)))
 
         # Sets of return value futures for each executor
         value_futures = [set() for ex in self.enqueue_executors]
@@ -1246,7 +1249,7 @@ def _accumulate(model_slice, model_chunk):
                         # Get chi-squared and model visibilities (and X2 gradient)
                         if self.enable_sersic_grad:
                             X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
-                            X2_grad += pinned_X2_grad
+                            self.X2_grad += pinned_X2_grad
                         else:
                             X2, pinned_model_vis, model_vis_idx = f.result()
                         # Sum X2
@@ -1269,7 +1272,7 @@ def _accumulate(model_slice, model_chunk):
             # Get chi-squared and model visibilities (and X2 gradient)
             if self.enable_sersic_grad:
                 X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
-                X2_grad += pinned_X2_grad
+                self.X2_grad += pinned_X2_grad
             else:
                 X2, pinned_model_vis, model_vis_idx = f.result()
             # Sum X2
@@ -1284,7 +1287,7 @@ def _accumulate(model_slice, model_chunk):
         else:
             self.set_X2(X2_sum/self.sigma_sqrd)
             if self.enable_sersic_grad:
-                self.X2_grad = X2_grad/self.sigma_sqrd
+                self.X2_grad = self.X2_grad/self.sigma_sqrd
 
     def shutdown(self):
         """ Shutdown the solver """
diff --git a/montblanc/impl/rime/v5/loaders/loaders.py b/montblanc/impl/rime/v5/loaders/loaders.py
index a35d25848..28545a661 100644
--- a/montblanc/impl/rime/v5/loaders/loaders.py
+++ b/montblanc/impl/rime/v5/loaders/loaders.py
@@ -335,4 +335,4 @@ def __enter__(solver):
         return super(MeasurementSetLoader,solver).__enter__()
 
     def __exit__(solver, type, value, traceback):
-        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)
\ No newline at end of file
+        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)

From a3b51fd3f38f0d354202c51ee19beb2f95e6391b Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-gpu.star.ucl.ac.uk>
Date: Thu, 11 May 2017 17:25:49 +0100
Subject: [PATCH 31/33] allowed solver reading context from the slvr_cfg

---
 montblanc/factory.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/montblanc/factory.py b/montblanc/factory.py
index eae4c4b54..3be22fd04 100644
--- a/montblanc/factory.py
+++ b/montblanc/factory.py
@@ -168,7 +168,8 @@ def rime_solver(slvr_cfg):
     elif version == Options.VERSION_FIVE:
         from montblanc.impl.rime.v5.CompositeRimeSolver \
         import CompositeRimeSolver as RimeSolver
-        slvr_cfg[Options.CONTEXT] = __contexts
+        if slvr_cfg.get(Options.CONTEXT, None) is None: 
+            slvr_cfg[Options.CONTEXT] = __contexts
     else:
         raise ValueError('Invalid version %s' % version)
 

From fc46cbb53927be44636aa4c7819e72adc6eb14a9 Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-login.local>
Date: Tue, 19 Sep 2017 13:35:25 +0100
Subject: [PATCH 32/33] bugs fixed: E_beam and G_term matrix default, RimeBSqrt
 function for npol=1

---
 montblanc/impl/rime/v4/config.py        | 4 ++--
 montblanc/impl/rime/v4/gpu/RimeBSqrt.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py
index bf801d3ed..4c7cb6a3c 100644
--- a/montblanc/impl/rime/v4/config.py
+++ b/montblanc/impl/rime/v4/config.py
@@ -181,14 +181,14 @@ class Classifier(Enum):
 
     ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 'npol'), 'ct',
         classifiers=frozenset([Classifier.E_BEAM_INPUT]),
-        default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
+        default=lambda s, ary: (np.array([1,0,0,1])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
             else np.array([1])[np.newaxis,np.newaxis,:]),
         test=lambda slvr, ary: rary(ary)),
 
     # Direction-Independent Effects
     ary_dict('G_term', ('ntime', 'na', 'nchan', 'npol'), 'ct',
         classifiers=frozenset([Classifier.COHERENCIES_INPUT]),
-        default=lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
+        default=lambda s, ary: (np.array([1,0,0,1])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
             else np.array([1])[np.newaxis,np.newaxis,:]),
         test=lambda slvr, ary: rary(ary)),
 
diff --git a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
index 1e56313fe..0c6df45f3 100644
--- a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
+++ b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
@@ -169,7 +169,7 @@
 
     // Read in the stokes parameter,
     // multiplying it by the power term
-    typename Tr::ft pol = stokes[i]*power;
+    typename Tr::ft pol = 2.*stokes[i]*power;    // include both the equal terms XX and YY
 
     // Write out the square root of the brightness
     i = (SRC*NTIME + TIME)*NPOLCHAN + POLCHAN;

From 4d06dbdb37d0dfd4f618992e02a14609cf831cff Mon Sep 17 00:00:00 2001
From: Marzia Rivi <mrivi@splinter-login.local>
Date: Fri, 22 Sep 2017 16:39:02 +0100
Subject: [PATCH 33/33] bug fixed for the case npol=1: final reduction didn't
 take into account the case for a single channel where num threadIdx.x = 1 < 3

---
 montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
index 6138019e7..f1389cb01 100644
--- a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -333,10 +333,10 @@ class SumCohTraits<double> {
       __syncthreads();
       //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
       // 3 different threads writes each a different component to avoid serialisation
-      if (threadIdx.x < 3 && threadIdx.y == 0 && threadIdx.z == 0)
+      if (threadIdx.x == 0 && threadIdx.y < 3 && threadIdx.z == 0)
       {
-          i = SRC - SRC_START + threadIdx.x*DEXT(nssrc);
-          atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.x]);
+          i = SRC - SRC_START + threadIdx.y*DEXT(nssrc);
+          atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.y]);
       }
     }