From 4c22fcff13d5994d5df5fb492bd72877c2c41ca8 Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Tue, 4 Jun 2019 18:06:33 -0400
Subject: [PATCH 001/100] Sandbox implementation of fill_halo_regions_tiled for MPI

Former-commit-id: f9286e6f9e1c1b1f3fb19bb528cbb66728202276
---
 sandbox/tiled_halos.jl | 65 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 sandbox/tiled_halos.jl

diff --git a/sandbox/tiled_halos.jl b/sandbox/tiled_halos.jl
new file mode 100644
index 0000000000..01baeb469e
--- /dev/null
+++ b/sandbox/tiled_halos.jl
@@ -0,0 +1,65 @@
+using Oceananigans
+
+@inline incmod1(a, n) = ifelse(a==n, 1, a + 1)
+@inline decmod1(a, n) = ifelse(a==1, n, a - 1)
+@inline index2rank(I, J, Mx, My) = J*My + I
+
+@inline east_halo(tile) = @views @inbounds tile.data[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :]
+@inline west_halo(tile) = @views @inbounds tile.data[1-tile.grid.Hx:0, :, :]
+@inline north_halo(tile) = @views @inbounds tile.data[:, 1-tile.grid.Hy:0, :]
+@inline south_halo(tile) = @views @inbounds tile.data[:, tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :]
+
+@inline east_data(tile) = @views @inbounds tile.data[1:tile.grid.Hx, :, :]
+@inline west_data(tile) = @views @inbounds tile.data[tile.grid.Nx-tile.grid.Hx+1:tile.grid.Nx, :, :]
+@inline north_data(tile) = @views @inbounds tile.data[:, 1:tile.grid.Hy, :]
+@inline south_data(tile) = @views @inbounds tile.data[:, tile.grid.Ny-tile.grid.Hy+1:tile.grid.Ny, :]
+
+function fill_halo_regions_tiled!(tiles, Mx, My)
+    for J in 0:My-1, I in 0:Mx-1
+        rank = index2rank(I, J, Mx, My)
+
+        I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx)
+        J⁻, J⁺ = mod(J-1, My), mod(J+1, My)
+
+        north_rank = index2rank(I, J⁻, Mx, My)
+        south_rank = index2rank(I, J⁺, Mx, My)
+        east_rank = index2rank(I⁺, J, Mx, My)
+        west_rank = index2rank(I⁻, J, Mx, My)
+
+        east_halo(tiles[rank+1]) .= west_data(tiles[east_rank+1])
+        west_halo(tiles[rank+1]) .= east_data(tiles[west_rank+1])
+        north_halo(tiles[rank+1]) .= south_data(tiles[north_rank+1])
+        south_halo(tiles[rank+1]) .= north_data(tiles[south_rank+1])
+    end
+end
+
+FT, arch = Float64, CPU()
+
+Nx, Ny, Nz = 16, 16, 16
+Lx, Ly, Lz = 10, 10, 10
+N, L = (Nx, Ny, Nz), (Lx, Ly, Lz)
+
+grid = RegularCartesianGrid(N, L)
+
+# MPI ranks along each dimension
+Mx, My = 2, 2
+
+R = rand(Nx, Ny, Nz)
+
+tiles = []
+for I in 0:Mx-1, J in 0:My-1
+    Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz
+    Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz
+    tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′))
+
+    tile = CellField(FT, arch, tile_grid)
+
+    i1, i2 = I*Nx′+1, (I+1)*Nx′
+    j1, j2 = J*Ny′+1, (J+1)*Ny′
+    data(tile) .= R[i1:i2, j1:j2, :]
+
+    push!(tiles, tile)
+end
+
+fill_halo_regions_tiled!(tiles, Mx, My)
+fill_halo_regions_tiled!(tiles, Mx, My)

From f6453eb55f65d7e6fbea225635698ccf1ca251df Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Wed, 5 Jun 2019 10:35:38 -0400
Subject: [PATCH 002/100] Fix north/south/east/west indexing.
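After this fix each directional helper pairs a halo range with the interior
range its neighbor needs: halos live outside the interior at 1-H:0 and
N+1:N+H, while the slabs sent to neighbors are the first and last H interior
cells. A minimal one-dimensional sketch of that convention, assuming interior
size N and halo width H as in the patch (OffsetArrays stands in for
tile.data, which is offset the same way):

    using OffsetArrays

    N, H = 4, 1
    f = OffsetArray(zeros(N + 2H), 1-H:N+H)  # one dimension of tile.data

    halo_lo = view(f, 1-H:0)      # ghost cells, filled from a neighbor
    halo_hi = view(f, N+1:N+H)
    data_lo = view(f, 1:H)        # interior cells a neighbor reads
    data_hi = view(f, N-H+1:N)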
Former-commit-id: c78d5fa5758a34d35c7554f0a31c96b24e077e49 --- sandbox/tiled_halos.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sandbox/tiled_halos.jl b/sandbox/tiled_halos.jl index 01baeb469e..2d6d547a95 100644 --- a/sandbox/tiled_halos.jl +++ b/sandbox/tiled_halos.jl @@ -4,15 +4,15 @@ using Oceananigans @inline decmod1(a, n) = ifelse(a==1, n, a - 1) @inline index2rank(I, J, Mx, My) = J*My + I -@inline east_halo(tile) = @views @inbounds tile.data[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :] -@inline west_halo(tile) = @views @inbounds tile.data[1-tile.grid.Hx:0, :, :] -@inline north_halo(tile) = @views @inbounds tile.data[:, 1-tile.grid.Hy:0, :] -@inline south_halo(tile) = @views @inbounds tile.data[:, tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :] - -@inline east_data(tile) = @views @inbounds tile.data[1:tile.grid.Hx, :, :] -@inline west_data(tile) = @views @inbounds tile.data[tile.grid.Nx-tile.grid.Hx+1:tile.grid.Nx, :, :] -@inline north_data(tile) = @views @inbounds tile.data[:, 1:tile.grid.Hy, :] -@inline south_data(tile) = @views @inbounds tile.data[:, tile.grid.Ny-tile.grid.Hy+1:tile.grid.Ny, :] +@inline north_halo(tile) = @views @inbounds tile.data[1-tile.grid.Hx:0, :, :] +@inline south_halo(tile) = @views @inbounds tile.data[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :] +@inline west_halo(tile) = @views @inbounds tile.data[:, 1-tile.grid.Hy:0, :] +@inline east_halo(tile) = @views @inbounds tile.data[:, tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :] + +@inline north_data(tile) = @views @inbounds tile.data[1:tile.grid.Hx, :, :] +@inline south_data(tile) = @views @inbounds tile.data[tile.grid.Nx-tile.grid.Hx+1:tile.grid.Nx, :, :] +@inline west_data(tile) = @views @inbounds tile.data[:, 1:tile.grid.Hy, :] +@inline east_data(tile) = @views @inbounds tile.data[:, tile.grid.Ny-tile.grid.Hy+1:tile.grid.Ny, :] function fill_halo_regions_tiled!(tiles, Mx, My) for J in 0:My-1, I in 0:Mx-1 From b8a1f705451d9d032c580cd73715930e115f16fe Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 5 Jun 2019 11:14:30 -0400 Subject: [PATCH 003/100] Tests for correct halo comm. Former-commit-id: b544d24bd14cb5b2d5ff797592df33b5cc9a695d --- sandbox/tiled_halos.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sandbox/tiled_halos.jl b/sandbox/tiled_halos.jl index 2d6d547a95..e2f90babb4 100644 --- a/sandbox/tiled_halos.jl +++ b/sandbox/tiled_halos.jl @@ -1,4 +1,4 @@ -using Oceananigans +using Oceananigans, Test @inline incmod1(a, n) = ifelse(a==n, 1, a + 1) @inline decmod1(a, n) = ifelse(a==1, n, a - 1) @@ -63,3 +63,8 @@ end fill_halo_regions_tiled!(tiles, Mx, My) fill_halo_regions_tiled!(tiles, Mx, My) + +@test all(tiles[1].data[1:end, 1:end, :] .== R[1:9, 1:9, :]) +@test all(tiles[2].data[1:end, 0:end-1, :] .== R[1:9, 8:end, :]) +@test all(tiles[3].data[0:end-1, 1:end, :] .== R[8:end, 1:9, :]) +@test all(tiles[4].data[0:end-1, 0:end-1, :] .== R[8:end, 8:end, :]) From e8632d8532453f329c42efd9bbe00ba5bac48ab5 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 5 Jun 2019 17:52:18 -0400 Subject: [PATCH 004/100] Starting an MPI version of filling halo regions. 
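The first cut: rank 0 owns the full array and scatters one tile to every rank
with nonblocking sends, using a per-rank tag (100 + r) so each receiver
matches only its own message, while every rank posts the corresponding
nonblocking receive. A standalone sketch of that MPI.jl pattern, assuming
exactly two ranks (same positional Isend/Irecv! API as the script; run with
mpiexec -n 2 julia sketch.jl):

    import MPI

    MPI.Init()
    comm = MPI.COMM_WORLD
    rank = MPI.Comm_rank(comm)

    if rank == 0
        sreq = MPI.Isend(rand(4, 4), 1, 100 + 1, comm)  # to rank 1, tag 101
        MPI.Waitall!([sreq])
    else
        recv_mesg = zeros(4, 4)
        rreq = MPI.Irecv!(recv_mesg, 0, 100 + rank, comm)  # from rank 0, tag 101
        MPI.Waitall!([rreq])
    end

    MPI.Finalize()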
Former-commit-id: 50599f8afd695a837b8749c017105ef64b171821 --- sandbox/tiled_halos_mpi.jl | 62 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 sandbox/tiled_halos_mpi.jl diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl new file mode 100644 index 0000000000..6f241c08fc --- /dev/null +++ b/sandbox/tiled_halos_mpi.jl @@ -0,0 +1,62 @@ +import MPI + +using Oceananigans + +@inline index2rank(I, J, Mx, My) = J*My + I + +function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) + comm = MPI.COMM_WORLD + + MPI.Barrier(comm) + + rank = MPI.Comm_rank(comm) + size = MPI.Comm_size(comm) + + I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) + J⁻, J⁺ = mod(J-1, My), mod(J+1, My) + + north_rank = index2rank(I, J⁻, Mx, My) + south_rank = index2rank(I, J⁺, Mx, My) + east_rank = index2rank(I⁺, J, Mx, My) + west_rank = index2rank(I⁻, J, Mx, My) + + send_reqs = [] + if rank == 0 + rands = rand(Nx, Ny, Nz) + + for r in 0:Mx*My + i1, i2 = I*Nx′+1, (I+1)*Nx′ + j1, j2 = J*Ny′+1, (J+1)*Ny′ + send_mesg = R[i1:i2, j1:j2, :] + + tag = 100 + r + println("[rank $rank] Sending R[$i1:$i2, $j1:$j2, :] to rank $r with tag $tag...") + + sreq = MPI.Isend(send_mesg, r, tag, comm) + push!(send_reqs, sreq) + end + + MPI.Waitall!(send_reqs) + end + + Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz + Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz + tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) + tile = CellField(FT, arch, tile_grid) + + recv_mesg = zeros(FT, Nx′, Ny′, Nz′) + + tag = 100 + r + println("[rank $rank] Receiving message from rank $src with tag $tag...") + rreq = MPI.Irecv!(recv_mesg, 0, tag, comm) + + data(tile) .= recv_mesg + + stats = MPI.Waitall!([rreq]) + + MPI.Barrier(comm) +end + +MPI.Init() +fill_halo_regions_mpi!(Float64, CPU(), 16, 16, 16, 2, 2) +MPI.Finalize() From ff0f875a943407022a3c5b3480d0db4e31ca88cb Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 5 Jun 2019 18:30:00 -0400 Subject: [PATCH 005/100] Fix typos and stuf Former-commit-id: 7da2fbaf7f2e4b0a6fae4c890e1a3b91a3c97718 --- sandbox/tiled_halos_mpi.jl | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index 6f241c08fc..e82ad44a27 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -3,8 +3,14 @@ import MPI using Oceananigans @inline index2rank(I, J, Mx, My) = J*My + I +@inline rank2index(r, Mx, My) = div(r, Mx), mod(r, My) function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) + Lx, Ly, Lz = 10, 10, 10 + + Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz + Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz + comm = MPI.COMM_WORLD MPI.Barrier(comm) @@ -12,22 +18,26 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) rank = MPI.Comm_rank(comm) size = MPI.Comm_size(comm) + I, J = rank2index(rank, Mx, My) I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) J⁻, J⁺ = mod(J-1, My), mod(J+1, My) + Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz + Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz north_rank = index2rank(I, J⁻, Mx, My) south_rank = index2rank(I, J⁺, Mx, My) east_rank = index2rank(I⁺, J, Mx, My) west_rank = index2rank(I⁻, J, Mx, My) - send_reqs = [] + send_reqs = MPI.Request[] if rank == 0 rands = rand(Nx, Ny, Nz) - for r in 0:Mx*My - i1, i2 = I*Nx′+1, (I+1)*Nx′ - j1, j2 = J*Ny′+1, (J+1)*Ny′ - send_mesg = R[i1:i2, j1:j2, :] + for r in 0:Mx*My-1 + I′, J′ = rank2index(r, Mx, My) + i1, i2 = I′*Nx′+1, (I′+1)*Nx′ + j1, j2 = J′*Ny′+1, (J′+1)*Ny′ + send_mesg = rands[i1:i2, j1:j2, :] tag = 100 + r println("[rank 
$rank] Sending R[$i1:$i2, $j1:$j2, :] to rank $r with tag $tag...") @@ -39,15 +49,13 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) MPI.Waitall!(send_reqs) end - Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz - Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) tile = CellField(FT, arch, tile_grid) recv_mesg = zeros(FT, Nx′, Ny′, Nz′) - tag = 100 + r - println("[rank $rank] Receiving message from rank $src with tag $tag...") + tag = 100 + rank + println("[rank $rank] Receiving message from rank 0 with tag $tag...") rreq = MPI.Irecv!(recv_mesg, 0, tag, comm) data(tile) .= recv_mesg From 0defdc9653881990ead849f7fa693d26cbb9e0d0 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 5 Jun 2019 18:51:57 -0400 Subject: [PATCH 006/100] Sending and reciving halo data. Former-commit-id: 541c389f3d5fd88ba0f2752c71e0ae5ef9cb1636 --- sandbox/tiled_halos_mpi.jl | 45 +++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index e82ad44a27..23f83d48d0 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -5,12 +5,28 @@ using Oceananigans @inline index2rank(I, J, Mx, My) = J*My + I @inline rank2index(r, Mx, My) = div(r, Mx), mod(r, My) +@inline north_halo(tile) = @views @inbounds tile.data[1-tile.grid.Hx:0, :, :] +@inline south_halo(tile) = @views @inbounds tile.data[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :] +@inline west_halo(tile) = @views @inbounds tile.data[:, 1-tile.grid.Hy:0, :] +@inline east_halo(tile) = @views @inbounds tile.data[:, tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :] + +@inline north_data(tile) = @views @inbounds tile.data[1:tile.grid.Hx, :, :] +@inline south_data(tile) = @views @inbounds tile.data[tile.grid.Nx-tile.grid.Hx+1:tile.grid.Nx, :, :] +@inline west_data(tile) = @views @inbounds tile.data[:, 1:tile.grid.Hy, :] +@inline east_data(tile) = @views @inbounds tile.data[:, tile.grid.Ny-tile.grid.Hy+1:tile.grid.Ny, :] + +@inline distribute_tag(rank) = 100 + rank +@inline send_west_tag(rank) = 200 + rank +@inline send_east_tag(rank) = 300 + rank +@inline send_north_tag(rank) = 400 + rank +@inline send_south_tag(rank) = 500 + rank + function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) Lx, Ly, Lz = 10, 10, 10 Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz - + comm = MPI.COMM_WORLD MPI.Barrier(comm) @@ -39,10 +55,8 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) j1, j2 = J′*Ny′+1, (J′+1)*Ny′ send_mesg = rands[i1:i2, j1:j2, :] - tag = 100 + r - println("[rank $rank] Sending R[$i1:$i2, $j1:$j2, :] to rank $r with tag $tag...") - - sreq = MPI.Isend(send_mesg, r, tag, comm) + println("[rank $rank] Sending R[$i1:$i2, $j1:$j2, :] to rank $r...") + sreq = MPI.Isend(send_mesg, r, distribute_tag(r), comm) push!(send_reqs, sreq) end @@ -52,15 +66,26 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) tile = CellField(FT, arch, tile_grid) + println("[rank $rank] Receiving message from rank 0...") recv_mesg = zeros(FT, Nx′, Ny′, Nz′) + rreq = MPI.Irecv!(recv_mesg, 0, distribute_tag(rank), comm) - tag = 100 + rank - println("[rank $rank] Receiving message from rank 0 with tag $tag...") - rreq = MPI.Irecv!(recv_mesg, 0, tag, comm) - + stats = MPI.Waitall!([rreq]) data(tile) .= recv_mesg - stats = MPI.Waitall!([rreq]) + println("[rank $rank] Sending halo data...") + se_req = 
MPI.Isend(west_data(tile), east_rank, send_east_tag(rank), comm) + sw_req = MPI.Isend(east_data(tile), west_rank, send_west_tag(rank), comm) + sn_req = MPI.Isend(south_data(tile), north_rank, send_north_tag(rank), comm) + ss_req = MPI.Isend(north_data(tile), south_rank, send_south_tag(rank), comm) + + MPI.Barrier(comm) + + println("[rank $rank] Receiving halo data...") + re_req = MPI.Irecv!(west_halo(tile), east_rank, send_west_tag(east_rank), comm) + rw_req = MPI.Irecv!(east_halo(tile), west_rank, send_east_tag(west_rank), comm) + rn_req = MPI.Irecv!(south_halo(tile), north_rank, send_south_tag(north_rank), comm) + rs_req = MPI.Irecv!(north_halo(tile), south_rank, send_north_tag(south_rank), comm) MPI.Barrier(comm) end From a75768b7c2bb8a1958713adb224751b99d9bcf17 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 5 Jun 2019 18:57:35 -0400 Subject: [PATCH 007/100] Receiving into contiguous buffers. Former-commit-id: 8eab25e94d46d0bc1dd3409c69a29defc6af8ea8 --- sandbox/tiled_halos_mpi.jl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index 23f83d48d0..860baba977 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -81,11 +81,16 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) MPI.Barrier(comm) + west_buf = zeros(size(west_halo(tile))) + east_buf = zeros(size(east_halo(tile))) + north_buf = zeros(size(north_halo(tile))) + south_buf = zeros(size(south_halo(tile))) + println("[rank $rank] Receiving halo data...") - re_req = MPI.Irecv!(west_halo(tile), east_rank, send_west_tag(east_rank), comm) - rw_req = MPI.Irecv!(east_halo(tile), west_rank, send_east_tag(west_rank), comm) - rn_req = MPI.Irecv!(south_halo(tile), north_rank, send_south_tag(north_rank), comm) - rs_req = MPI.Irecv!(north_halo(tile), south_rank, send_north_tag(south_rank), comm) + re_req = MPI.Irecv!(west_buf, east_rank, send_west_tag(east_rank), comm) + rw_req = MPI.Irecv!(east_buf, west_rank, send_east_tag(west_rank), comm) + rn_req = MPI.Irecv!(south_buf, north_rank, send_south_tag(north_rank), comm) + rs_req = MPI.Irecv!(north_buf, south_rank, send_north_tag(south_rank), comm) MPI.Barrier(comm) end From d4718c834261223fb6323d84090c24d6cb6d54a9 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 5 Jun 2019 19:01:42 -0400 Subject: [PATCH 008/100] Sending buffers too... 
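Same idea as on the receive side: the halo and data helpers return strided
views into tile.data, and MPI wants plain contiguous buffers, so every
message is staged through a freshly allocated Array. A quick illustration of
why the views cannot be handed to Isend directly (illustrative only, not
part of the patch):

    A = zeros(8, 8, 8)
    v = view(A, :, 1:2, :)   # a slab like west_data(tile)
    Base.iscontiguous(v)     # false: strided view, not safe to pass to MPI
    buf = Array(v)           # contiguous copy that is safe to send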
Former-commit-id: 5435f78c792a4cd62225e24025ff4e0ae526a631 --- sandbox/tiled_halos_mpi.jl | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index 860baba977..f4bf1a194f 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -74,23 +74,29 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) data(tile) .= recv_mesg println("[rank $rank] Sending halo data...") - se_req = MPI.Isend(west_data(tile), east_rank, send_east_tag(rank), comm) - sw_req = MPI.Isend(east_data(tile), west_rank, send_west_tag(rank), comm) - sn_req = MPI.Isend(south_data(tile), north_rank, send_north_tag(rank), comm) - ss_req = MPI.Isend(north_data(tile), south_rank, send_south_tag(rank), comm) + + west_data_buf = zeros(size(west_data(tile))) + east_data_buf = zeros(size(east_data(tile))) + north_data_buf = zeros(size(north_data(tile))) + south_data_buf = zeros(size(south_data(tile))) + + se_req = MPI.Isend(west_data_buf, east_rank, send_east_tag(rank), comm) + sw_req = MPI.Isend(east_data_buf, west_rank, send_west_tag(rank), comm) + sn_req = MPI.Isend(south_data_buf, north_rank, send_north_tag(rank), comm) + ss_req = MPI.Isend(north_data_buf, south_rank, send_south_tag(rank), comm) MPI.Barrier(comm) - west_buf = zeros(size(west_halo(tile))) - east_buf = zeros(size(east_halo(tile))) - north_buf = zeros(size(north_halo(tile))) - south_buf = zeros(size(south_halo(tile))) + west_halo_buf = zeros(size(west_halo(tile))) + east_halo_buf = zeros(size(east_halo(tile))) + north_halo_buf = zeros(size(north_halo(tile))) + south_halo_buf = zeros(size(south_halo(tile))) println("[rank $rank] Receiving halo data...") - re_req = MPI.Irecv!(west_buf, east_rank, send_west_tag(east_rank), comm) - rw_req = MPI.Irecv!(east_buf, west_rank, send_east_tag(west_rank), comm) - rn_req = MPI.Irecv!(south_buf, north_rank, send_south_tag(north_rank), comm) - rs_req = MPI.Irecv!(north_buf, south_rank, send_north_tag(south_rank), comm) + re_req = MPI.Irecv!(west_halo_buf, east_rank, send_west_tag(east_rank), comm) + rw_req = MPI.Irecv!(east_halo_buf, west_rank, send_east_tag(west_rank), comm) + rn_req = MPI.Irecv!(south_halo_buf, north_rank, send_south_tag(north_rank), comm) + rs_req = MPI.Irecv!(north_halo_buf, south_rank, send_north_tag(south_rank), comm) MPI.Barrier(comm) end From e4083ee07c31ab8aed1e053a8a374f90a4f5c5bd Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Thu, 6 Jun 2019 08:21:38 -0400 Subject: [PATCH 009/100] Woops, actually read from MPI buffer. 
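The previous commit allocated the send buffers with zeros(...) and never
copied the field data into them, so every neighbor received zeros; likewise
the received halo buffers were never written back into the field. The
staging has to copy on both ends, roughly like this (a sketch using the
helper names defined earlier; `tag` stands for whichever matched tag the two
sides agree on):

    send_buf = copy(east_data(tile))           # stage: view -> Array
    sreq = MPI.Isend(send_buf, east_rank, tag, comm)

    recv_buf = zeros(size(west_halo(tile)))
    rreq = MPI.Irecv!(recv_buf, west_rank, tag, comm)

    MPI.Waitall!([sreq, rreq])
    west_halo(tile) .= recv_buf                # unstage: Array -> view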
Former-commit-id: c1e3375c3f7942065f66f527161fb9bfdc575f6c --- sandbox/tiled_halos_mpi.jl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index f4bf1a194f..d35c5e80ce 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -75,10 +75,10 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) println("[rank $rank] Sending halo data...") - west_data_buf = zeros(size(west_data(tile))) - east_data_buf = zeros(size(east_data(tile))) - north_data_buf = zeros(size(north_data(tile))) - south_data_buf = zeros(size(south_data(tile))) + west_data_buf = copy(west_data(tile)) + east_data_buf = copy(east_data(tile)) + north_data_buf = copy(north_data(tile)) + south_data_buf = copy(south_data(tile)) se_req = MPI.Isend(west_data_buf, east_rank, send_east_tag(rank), comm) sw_req = MPI.Isend(east_data_buf, west_rank, send_west_tag(rank), comm) @@ -99,6 +99,11 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) rs_req = MPI.Irecv!(north_halo_buf, south_rank, send_north_tag(south_rank), comm) MPI.Barrier(comm) + + east_halo(tile) .= west_halo_buf + west_halo(tile) .= east_halo_buf + north_halo(tile) .= south_halo_buf + south_halo(tile) .= north_halo_buf end MPI.Init() From 95b2edff92f0564c4091763688459b24b690eb5e Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Thu, 6 Jun 2019 15:18:06 -0400 Subject: [PATCH 010/100] Halo comm working on 4 ranks Former-commit-id: bd5cf58b730ff42d5cd87a5283ba196a1bc6da48 --- sandbox/tiled_halos_mpi.jl | 77 +++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index d35c5e80ce..f8fa566caa 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -3,17 +3,17 @@ import MPI using Oceananigans @inline index2rank(I, J, Mx, My) = J*My + I -@inline rank2index(r, Mx, My) = div(r, Mx), mod(r, My) +@inline rank2index(r, Mx, My) = mod(r, Mx), div(r, My) -@inline north_halo(tile) = @views @inbounds tile.data[1-tile.grid.Hx:0, :, :] -@inline south_halo(tile) = @views @inbounds tile.data[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :] -@inline west_halo(tile) = @views @inbounds tile.data[:, 1-tile.grid.Hy:0, :] -@inline east_halo(tile) = @views @inbounds tile.data[:, tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :] +@inline north_halo(tile) = @views @inbounds tile.data.parent[1:tile.grid.Hx, :, :] +@inline south_halo(tile) = @views @inbounds tile.data.parent[tile.grid.Nx+tile.grid.Hx+1:tile.grid.Nx+2tile.grid.Hx, :, :] +@inline west_halo(tile) = @views @inbounds tile.data.parent[:, 1:tile.grid.Hy, :] +@inline east_halo(tile) = @views @inbounds tile.data.parent[:, tile.grid.Ny+tile.grid.Hy+1:tile.grid.Ny+2tile.grid.Hy, :] -@inline north_data(tile) = @views @inbounds tile.data[1:tile.grid.Hx, :, :] -@inline south_data(tile) = @views @inbounds tile.data[tile.grid.Nx-tile.grid.Hx+1:tile.grid.Nx, :, :] -@inline west_data(tile) = @views @inbounds tile.data[:, 1:tile.grid.Hy, :] -@inline east_data(tile) = @views @inbounds tile.data[:, tile.grid.Ny-tile.grid.Hy+1:tile.grid.Ny, :] +@inline north_data(tile) = @views @inbounds tile.data.parent[1+tile.grid.Hx:2tile.grid.Hx, :, :] +@inline south_data(tile) = @views @inbounds tile.data.parent[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :] +@inline west_data(tile) = @views @inbounds tile.data.parent[:, 1+tile.grid.Hy:2tile.grid.Hy, :] +@inline east_data(tile) = @views @inbounds tile.data.parent[:, 
tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :] @inline distribute_tag(rank) = 100 + rank @inline send_west_tag(rank) = 200 + rank @@ -32,7 +32,7 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) MPI.Barrier(comm) rank = MPI.Comm_rank(comm) - size = MPI.Comm_size(comm) + # size = MPI.Comm_size(comm) I, J = rank2index(rank, Mx, My) I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) @@ -70,22 +70,32 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) recv_mesg = zeros(FT, Nx′, Ny′, Nz′) rreq = MPI.Irecv!(recv_mesg, 0, distribute_tag(rank), comm) - stats = MPI.Waitall!([rreq]) + stats = MPI.Wait!(rreq) data(tile) .= recv_mesg println("[rank $rank] Sending halo data...") - west_data_buf = copy(west_data(tile)) - east_data_buf = copy(east_data(tile)) - north_data_buf = copy(north_data(tile)) - south_data_buf = copy(south_data(tile)) + west_data_buf = zeros(size(west_data(tile))) + east_data_buf = zeros(size(east_data(tile))) + north_data_buf = zeros(size(north_data(tile))) + south_data_buf = zeros(size(south_data(tile))) - se_req = MPI.Isend(west_data_buf, east_rank, send_east_tag(rank), comm) - sw_req = MPI.Isend(east_data_buf, west_rank, send_west_tag(rank), comm) - sn_req = MPI.Isend(south_data_buf, north_rank, send_north_tag(rank), comm) - ss_req = MPI.Isend(north_data_buf, south_rank, send_south_tag(rank), comm) + west_data_buf .= copy(west_data(tile)) + east_data_buf .= copy(east_data(tile)) + north_data_buf .= copy(north_data(tile)) + south_data_buf .= copy(south_data(tile)) - MPI.Barrier(comm) + se_req = MPI.Isend(east_data_buf, east_rank, send_east_tag(rank), comm) + sw_req = MPI.Isend(west_data_buf, west_rank, send_west_tag(rank), comm) + sn_req = MPI.Isend(north_data_buf, north_rank, send_north_tag(rank), comm) + ss_req = MPI.Isend(south_data_buf, south_rank, send_south_tag(rank), comm) + + @debug "[rank $rank] sending #$(send_east_tag(rank)) to rank $east_rank" + @debug "[rank $rank] sending #$(send_west_tag(rank)) to rank $west_rank" + @debug "[rank $rank] sending #$(send_north_tag(rank)) to rank $north_rank" + @debug "[rank $rank] sending #$(send_south_tag(rank)) to rank $south_rank" + + MPI.Waitall!([se_req, sw_req, sn_req, ss_req]) west_halo_buf = zeros(size(west_halo(tile))) east_halo_buf = zeros(size(east_halo(tile))) @@ -93,17 +103,22 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) south_halo_buf = zeros(size(south_halo(tile))) println("[rank $rank] Receiving halo data...") - re_req = MPI.Irecv!(west_halo_buf, east_rank, send_west_tag(east_rank), comm) - rw_req = MPI.Irecv!(east_halo_buf, west_rank, send_east_tag(west_rank), comm) - rn_req = MPI.Irecv!(south_halo_buf, north_rank, send_south_tag(north_rank), comm) - rs_req = MPI.Irecv!(north_halo_buf, south_rank, send_north_tag(south_rank), comm) - - MPI.Barrier(comm) - - east_halo(tile) .= west_halo_buf - west_halo(tile) .= east_halo_buf - north_halo(tile) .= south_halo_buf - south_halo(tile) .= north_halo_buf + re_req = MPI.Irecv!(west_halo_buf, west_rank, send_east_tag(west_rank), comm) + rw_req = MPI.Irecv!(east_halo_buf, east_rank, send_west_tag(east_rank), comm) + rn_req = MPI.Irecv!(south_halo_buf, south_rank, send_north_tag(south_rank), comm) + rs_req = MPI.Irecv!(north_halo_buf, north_rank, send_south_tag(north_rank), comm) + + @debug "[rank $rank] waiting for #$(send_east_tag(west_rank)) from rank $west_rank..." + @debug "[rank $rank] waiting for #$(send_west_tag(east_rank)) from rank $east_rank..." + @debug "[rank $rank] waiting for #$(send_north_tag(south_rank)) from rank $south_rank..." 
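    # Annotation, not part of the patch: the pairing fixed above is that the
    # west halo is received from west_rank with the tag that neighbor used for
    # its eastward send, i.e. send_east_tag(west_rank). For example, with
    # Mx = My = 2, rank 0 has west_rank = 1, so it waits on tag 300 + 1 = 301,
    # which is exactly what these debug lines print.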
+ @debug "[rank $rank] waiting for #$(send_south_tag(north_rank)) from rank $north_rank..." + + MPI.Waitall!([re_req, rw_req, rn_req, rs_req]) + + east_halo(tile) .= east_halo_buf + west_halo(tile) .= west_halo_buf + north_halo(tile) .= north_halo_buf + south_halo(tile) .= south_halo_buf end MPI.Init() From 800cb8008c122100056f0747d98cb959c2f0fddc Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Thu, 6 Jun 2019 15:28:28 -0400 Subject: [PATCH 011/100] Nice send and receive halo functions Former-commit-id: 971a4f646b080b1b34f0d6e2b591acbd9a1b5b77 --- sandbox/tiled_halos_mpi.jl | 113 +++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index f8fa566caa..d3d5aad97a 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -21,6 +21,67 @@ using Oceananigans @inline send_north_tag(rank) = 400 + rank @inline send_south_tag(rank) = 500 + rank +function send_halo_data(tile) + rank = MPI.Comm_rank(comm) + + I, J = rank2index(rank, Mx, My) + I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) + J⁻, J⁺ = mod(J-1, My), mod(J+1, My) + Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz + Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz + + north_rank = index2rank(I, J⁻, Mx, My) + south_rank = index2rank(I, J⁺, Mx, My) + east_rank = index2rank(I⁺, J, Mx, My) + west_rank = index2rank(I⁻, J, Mx, My) + + west_data_buf = zeros(size(west_data(tile))) + east_data_buf = zeros(size(east_data(tile))) + north_data_buf = zeros(size(north_data(tile))) + south_data_buf = zeros(size(south_data(tile))) + + west_data_buf .= copy(west_data(tile)) + east_data_buf .= copy(east_data(tile)) + north_data_buf .= copy(north_data(tile)) + south_data_buf .= copy(south_data(tile)) + + se_req = MPI.Isend(east_data_buf, east_rank, send_east_tag(rank), comm) + sw_req = MPI.Isend(west_data_buf, west_rank, send_west_tag(rank), comm) + sn_req = MPI.Isend(north_data_buf, north_rank, send_north_tag(rank), comm) + ss_req = MPI.Isend(south_data_buf, south_rank, send_south_tag(rank), comm) + + @debug "[rank $rank] sending #$(send_east_tag(rank)) to rank $east_rank" + @debug "[rank $rank] sending #$(send_west_tag(rank)) to rank $west_rank" + @debug "[rank $rank] sending #$(send_north_tag(rank)) to rank $north_rank" + @debug "[rank $rank] sending #$(send_south_tag(rank)) to rank $south_rank" + + MPI.Waitall!([se_req, sw_req, sn_req, ss_req]) +end + +function receive_halo_data(tile) + west_halo_buf = zeros(size(west_halo(tile))) + east_halo_buf = zeros(size(east_halo(tile))) + north_halo_buf = zeros(size(north_halo(tile))) + south_halo_buf = zeros(size(south_halo(tile))) + + re_req = MPI.Irecv!(west_halo_buf, west_rank, send_east_tag(west_rank), comm) + rw_req = MPI.Irecv!(east_halo_buf, east_rank, send_west_tag(east_rank), comm) + rn_req = MPI.Irecv!(south_halo_buf, south_rank, send_north_tag(south_rank), comm) + rs_req = MPI.Irecv!(north_halo_buf, north_rank, send_south_tag(north_rank), comm) + + @debug "[rank $rank] waiting for #$(send_east_tag(west_rank)) from rank $west_rank..." + @debug "[rank $rank] waiting for #$(send_west_tag(east_rank)) from rank $east_rank..." + @debug "[rank $rank] waiting for #$(send_north_tag(south_rank)) from rank $south_rank..." + @debug "[rank $rank] waiting for #$(send_south_tag(north_rank)) from rank $north_rank..." 
+ + MPI.Waitall!([re_req, rw_req, rn_req, rs_req]) + + east_halo(tile) .= east_halo_buf + west_halo(tile) .= west_halo_buf + north_halo(tile) .= north_halo_buf + south_halo(tile) .= south_halo_buf +end + function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) Lx, Ly, Lz = 10, 10, 10 @@ -32,7 +93,6 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) MPI.Barrier(comm) rank = MPI.Comm_rank(comm) - # size = MPI.Comm_size(comm) I, J = rank2index(rank, Mx, My) I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) @@ -66,7 +126,7 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) tile = CellField(FT, arch, tile_grid) - println("[rank $rank] Receiving message from rank 0...") + println("[rank $rank] Receiving tile from rank 0...") recv_mesg = zeros(FT, Nx′, Ny′, Nz′) rreq = MPI.Irecv!(recv_mesg, 0, distribute_tag(rank), comm) @@ -74,51 +134,20 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) data(tile) .= recv_mesg println("[rank $rank] Sending halo data...") - - west_data_buf = zeros(size(west_data(tile))) - east_data_buf = zeros(size(east_data(tile))) - north_data_buf = zeros(size(north_data(tile))) - south_data_buf = zeros(size(south_data(tile))) - - west_data_buf .= copy(west_data(tile)) - east_data_buf .= copy(east_data(tile)) - north_data_buf .= copy(north_data(tile)) - south_data_buf .= copy(south_data(tile)) - - se_req = MPI.Isend(east_data_buf, east_rank, send_east_tag(rank), comm) - sw_req = MPI.Isend(west_data_buf, west_rank, send_west_tag(rank), comm) - sn_req = MPI.Isend(north_data_buf, north_rank, send_north_tag(rank), comm) - ss_req = MPI.Isend(south_data_buf, south_rank, send_south_tag(rank), comm) - - @debug "[rank $rank] sending #$(send_east_tag(rank)) to rank $east_rank" - @debug "[rank $rank] sending #$(send_west_tag(rank)) to rank $west_rank" - @debug "[rank $rank] sending #$(send_north_tag(rank)) to rank $north_rank" - @debug "[rank $rank] sending #$(send_south_tag(rank)) to rank $south_rank" - - MPI.Waitall!([se_req, sw_req, sn_req, ss_req]) - - west_halo_buf = zeros(size(west_halo(tile))) - east_halo_buf = zeros(size(east_halo(tile))) - north_halo_buf = zeros(size(north_halo(tile))) - south_halo_buf = zeros(size(south_halo(tile))) + send_halo_data(tile) println("[rank $rank] Receiving halo data...") - re_req = MPI.Irecv!(west_halo_buf, west_rank, send_east_tag(west_rank), comm) - rw_req = MPI.Irecv!(east_halo_buf, east_rank, send_west_tag(east_rank), comm) - rn_req = MPI.Irecv!(south_halo_buf, south_rank, send_north_tag(south_rank), comm) - rs_req = MPI.Irecv!(north_halo_buf, north_rank, send_south_tag(north_rank), comm) + receive_halo_data(tile) - @debug "[rank $rank] waiting for #$(send_east_tag(west_rank)) from rank $west_rank..." - @debug "[rank $rank] waiting for #$(send_west_tag(east_rank)) from rank $east_rank..." - @debug "[rank $rank] waiting for #$(send_north_tag(south_rank)) from rank $south_rank..." - @debug "[rank $rank] waiting for #$(send_south_tag(north_rank)) from rank $north_rank..." 
+ println("[rank $rank] Sending halo data...") + send_halo_data(tile) - MPI.Waitall!([re_req, rw_req, rn_req, rs_req]) + println("[rank $rank] Receiving halo data...") + receive_halo_data(tile) - east_halo(tile) .= east_halo_buf - west_halo(tile) .= west_halo_buf - north_halo(tile) .= north_halo_buf - south_halo(tile) .= south_halo_buf + if rank == 3 + display(tile.data) + end end MPI.Init() From 1e7a910d857d30854be4be56ad53dcadd8ea8f1d Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Thu, 6 Jun 2019 15:34:37 -0400 Subject: [PATCH 012/100] Fix nice functions Former-commit-id: f7f4c2279954b2daeee53f20262fc02c62835adc --- sandbox/tiled_halos_mpi.jl | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index d3d5aad97a..83eaa9b648 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -21,14 +21,13 @@ using Oceananigans @inline send_north_tag(rank) = 400 + rank @inline send_south_tag(rank) = 500 + rank -function send_halo_data(tile) +function send_halo_data(tile, Mx, My) + comm = MPI.COMM_WORLD rank = MPI.Comm_rank(comm) I, J = rank2index(rank, Mx, My) I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) J⁻, J⁺ = mod(J-1, My), mod(J+1, My) - Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz - Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz north_rank = index2rank(I, J⁻, Mx, My) south_rank = index2rank(I, J⁺, Mx, My) @@ -58,7 +57,19 @@ function send_halo_data(tile) MPI.Waitall!([se_req, sw_req, sn_req, ss_req]) end -function receive_halo_data(tile) +function receive_halo_data(tile, Mx, My) + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + + I, J = rank2index(rank, Mx, My) + I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) + J⁻, J⁺ = mod(J-1, My), mod(J+1, My) + + north_rank = index2rank(I, J⁻, Mx, My) + south_rank = index2rank(I, J⁺, Mx, My) + east_rank = index2rank(I⁺, J, Mx, My) + west_rank = index2rank(I⁻, J, Mx, My) + west_halo_buf = zeros(size(west_halo(tile))) east_halo_buf = zeros(size(east_halo(tile))) north_halo_buf = zeros(size(north_halo(tile))) @@ -134,16 +145,16 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) data(tile) .= recv_mesg println("[rank $rank] Sending halo data...") - send_halo_data(tile) + send_halo_data(tile, Mx, My) println("[rank $rank] Receiving halo data...") - receive_halo_data(tile) + receive_halo_data(tile, Mx, My) println("[rank $rank] Sending halo data...") - send_halo_data(tile) + send_halo_data(tile, Mx, My) println("[rank $rank] Receiving halo data...") - receive_halo_data(tile) + receive_halo_data(tile, Mx, My) if rank == 3 display(tile.data) From d455bd9a1638df04ee01540bc98587843e16b2fa Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Thu, 6 Jun 2019 18:02:01 -0400 Subject: [PATCH 013/100] Less MPI issues, bigger problem Former-commit-id: 971d12a9bada3d630414f91e46e1738662384b42 --- sandbox/tiled_halos_mpi.jl | 101 +++++++++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 27 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index 83eaa9b648..15f433c94c 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -1,7 +1,41 @@ +using Printf + import MPI using Oceananigans +# Source: https://github.com/JuliaCI/BenchmarkTools.jl/blob/master/src/trials.jl +function prettytime(t) + if t < 1e3 + value, units = t, "ns" + elseif t < 1e6 + value, units = t / 1e3, "μs" + elseif t < 1e9 + value, units = t / 1e6, "ms" + else + s = t / 1e9 + if s < 60 + value, units = s, "s" + else + value, units = (s / 60), "min" + 
end + end + return string(@sprintf("%.3f", value), " ", units) +end + +function prettybandwidth(b) + if b < 1024 + val, units = b, "B/s" + elseif b < 1024^2 + val, units = b / 1024, "KiB/s" + elseif b < 1024^3 + val, units = b / 1024^2, "MiB/s" + else + val, units = b / 1024^3, "GiB/s" + end + return string(@sprintf("%.3f", val), " ", units) +end + @inline index2rank(I, J, Mx, My) = J*My + I @inline rank2index(r, Mx, My) = mod(r, Mx), div(r, My) @@ -21,8 +55,7 @@ using Oceananigans @inline send_north_tag(rank) = 400 + rank @inline send_south_tag(rank) = 500 + rank -function send_halo_data(tile, Mx, My) - comm = MPI.COMM_WORLD +function send_halo_data(tile, Mx, My, comm) rank = MPI.Comm_rank(comm) I, J = rank2index(rank, Mx, My) @@ -53,12 +86,9 @@ function send_halo_data(tile, Mx, My) @debug "[rank $rank] sending #$(send_west_tag(rank)) to rank $west_rank" @debug "[rank $rank] sending #$(send_north_tag(rank)) to rank $north_rank" @debug "[rank $rank] sending #$(send_south_tag(rank)) to rank $south_rank" - - MPI.Waitall!([se_req, sw_req, sn_req, ss_req]) end -function receive_halo_data(tile, Mx, My) - comm = MPI.COMM_WORLD +function receive_halo_data(tile, Mx, My, comm) rank = MPI.Comm_rank(comm) I, J = rank2index(rank, Mx, My) @@ -100,10 +130,11 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz comm = MPI.COMM_WORLD - + MPI.Barrier(comm) rank = MPI.Comm_rank(comm) + R = MPI.Comm_size(comm) I, J = rank2index(rank, Mx, My) I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) @@ -116,51 +147,67 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) east_rank = index2rank(I⁺, J, Mx, My) west_rank = index2rank(I⁻, J, Mx, My) + tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) + tile = CellField(FT, arch, tile_grid) + send_reqs = MPI.Request[] if rank == 0 rands = rand(Nx, Ny, Nz) - for r in 0:Mx*My-1 + for r in 1:Mx*My-1 I′, J′ = rank2index(r, Mx, My) i1, i2 = I′*Nx′+1, (I′+1)*Nx′ j1, j2 = J′*Ny′+1, (J′+1)*Ny′ send_mesg = rands[i1:i2, j1:j2, :] - println("[rank $rank] Sending R[$i1:$i2, $j1:$j2, :] to rank $r...") + println("[rank $rank] Sending rands[$i1:$i2, $j1:$j2, :] to rank $r...") sreq = MPI.Isend(send_mesg, r, distribute_tag(r), comm) push!(send_reqs, sreq) end + data(tile) .= rands[1:Nx′, 1:Ny′, :] + MPI.Waitall!(send_reqs) end - tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) - tile = CellField(FT, arch, tile_grid) - - println("[rank $rank] Receiving tile from rank 0...") - recv_mesg = zeros(FT, Nx′, Ny′, Nz′) - rreq = MPI.Irecv!(recv_mesg, 0, distribute_tag(rank), comm) - - stats = MPI.Wait!(rreq) - data(tile) .= recv_mesg + if rank != 0 + println("[rank $rank] Receiving tile from rank 0...") + recv_mesg = zeros(FT, Nx′, Ny′, Nz′) + rreq = MPI.Irecv!(recv_mesg, 0, distribute_tag(rank), comm) + stats = MPI.Wait!(rreq) + data(tile) .= recv_mesg + end + println("[rank $rank] Sending halo data...") - send_halo_data(tile, Mx, My) + send_halo_data(tile, Mx, My, comm) println("[rank $rank] Receiving halo data...") - receive_halo_data(tile, Mx, My) + receive_halo_data(tile, Mx, My, comm) + + MPI.Barrier(comm) + if rank == 0 + tic = time_ns() + end println("[rank $rank] Sending halo data...") - send_halo_data(tile, Mx, My) + send_halo_data(tile, Mx, My, comm) println("[rank $rank] Receiving halo data...") - receive_halo_data(tile, Mx, My) - - if rank == 3 - display(tile.data) - end + receive_halo_data(tile, Mx, My, comm) + + MPI.Barrier(comm) + if rank == 0 + t = (time_ns() - tic) + ts = t / 1e9 + @info "$R ranks halo 
communication time: $(prettytime(t))" + + Hx, Hy = 1, 1 + data_size = sizeof(FT) * 2Nz*(Hx*Nx + Hy*Ny) + @info "$R ranks halo communication bandwidth: $(prettybandwidth(data_size/ts))" + end end MPI.Init() -fill_halo_regions_mpi!(Float64, CPU(), 16, 16, 16, 2, 2) +fill_halo_regions_mpi!(Float64, CPU(), 512, 512, 512, 2, 2) MPI.Finalize() From 9c0748ffb074b65483dd1534c1f8a09d28ddab10 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 7 Jun 2019 08:44:14 -0400 Subject: [PATCH 014/100] Need CuArrays for broadcast Former-commit-id: e38f3b8821ec317fc59546931d1a5cfceeaab52f --- sandbox/tiled_halos_mpi.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index 15f433c94c..04bc1d9356 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -1,5 +1,6 @@ using Printf +using CuArrays import MPI using Oceananigans @@ -117,10 +118,10 @@ function receive_halo_data(tile, Mx, My, comm) MPI.Waitall!([re_req, rw_req, rn_req, rs_req]) - east_halo(tile) .= east_halo_buf - west_halo(tile) .= west_halo_buf - north_halo(tile) .= north_halo_buf - south_halo(tile) .= south_halo_buf + east_halo(tile) .= CuArray(east_halo_buf) + west_halo(tile) .= CuArray(west_halo_buf) + north_halo(tile) .= CuArray(north_halo_buf) + south_halo(tile) .= CuArray(south_halo_buf) end function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) @@ -209,5 +210,5 @@ function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) end MPI.Init() -fill_halo_regions_mpi!(Float64, CPU(), 512, 512, 512, 2, 2) +fill_halo_regions_mpi!(Float64, GPU(), 192, 192, 192, 3, 3) MPI.Finalize() From 14e6a8431a2299b4ce1da61d6b66b354869c0e24 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 7 Jun 2019 09:43:43 -0400 Subject: [PATCH 015/100] note to self Former-commit-id: 357c2c94f0777cd8f4e87179c42c358ec62ec2ce --- sandbox/tiled_halos_mpi.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl index 04bc1d9356..4a01425482 100644 --- a/sandbox/tiled_halos_mpi.jl +++ b/sandbox/tiled_halos_mpi.jl @@ -68,6 +68,7 @@ function send_halo_data(tile, Mx, My, comm) east_rank = index2rank(I⁺, J, Mx, My) west_rank = index2rank(I⁻, J, Mx, My) + # cuzeros doesn't exist anymore. Use similar! west_data_buf = zeros(size(west_data(tile))) east_data_buf = zeros(size(east_data(tile))) north_data_buf = zeros(size(north_data(tile))) From d337ebfd7b2c9e890c6c1d6689db495a677a247f Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 14 Dec 2019 22:46:55 -0500 Subject: [PATCH 016/100] Start prototyping a `DistributedModel` type. 
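The idea is a container that carries everything an MPI-distributed run
needs: the rank decomposition, per-rank model state, a connectivity graph
saying which rank borders which, and the communicator. A sketch of where the
API is heading (intended usage only, not working code yet):

    # dm = DistributedModel(ranks=(2, 2, 1), size=(64, 64, 16), ...)
    # dm.ranks               -> (Rx, Ry, Rz)
    # dm.connectivity_graph  -> east/west/north/south/top/bottom neighbors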
---
 src/distributed_model.jl | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 src/distributed_model.jl

diff --git a/src/distributed_model.jl b/src/distributed_model.jl
new file mode 100644
index 0000000000..711893c165
--- /dev/null
+++ b/src/distributed_model.jl
@@ -0,0 +1,19 @@
+import MPI
+
+using Oceananigans
+
+struct DistributedModel{A, R, G, C}
+    ranks :: R
+    models :: A
+    connectivity_graph :: G
+    MPI_Comm :: C
+end
+
+const FieldBoundaryConditions = NamedTuple{(:east, :west, :north, :south, :top, :bottom)}
+
+function validate_tupled_argument(arg, argtype, argname)
+    length(arg) == 3 || throw(ArgumentError("length($argname) must be 3."))
+    all(isa.(arg, argtype)) || throw(ArgumentError("$argname=$arg must contain $argtype s."))
+    all(arg .> 0) || throw(ArgumentError("Elements of $argname=$arg must be > 0!"))
+    return nothing
+end

From 7ca306ce786486ec656586ece5cdab29ca62ae5e Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Sat, 14 Dec 2019 22:47:15 -0500
Subject: [PATCH 017/100] `DistributedModel` constructor that checks for consistent ranks

---
 src/distributed_model.jl | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/distributed_model.jl b/src/distributed_model.jl
index 711893c165..6272e23d85 100644
--- a/src/distributed_model.jl
+++ b/src/distributed_model.jl
@@ -17,3 +17,34 @@ function validate_tupled_argument(arg, argtype, argname)
     all(arg .> 0) || throw(ArgumentError("Elements of $argname=$arg must be > 0!"))
     return nothing
 end
+
+function DistributedModel(; ranks, model_kwargs...)
+    validate_tupled_argument(ranks, Int, "ranks")
+    Rx, Ry, Rz = ranks
+    total_ranks = Rx*Ry*Rz
+
+    MPI.Init()
+    comm = MPI.COMM_WORLD
+
+    mpi_ranks = MPI.Comm_size(comm)
+    my_rank = MPI.Comm_rank(comm)
+
+    if my_rank == 0
+        if total_ranks != mpi_ranks
+            throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " *
+                                "with number of MPI ranks: $mpi_ranks. Exiting with code 1."))
+            MPI.Finalize()
+            exit(code=1)
+        end
+    end
+
+    # Ensure that ranks 1:N don't go ahead if total_ranks != mpi_ranks.
+    MPI.Barrier(comm)
+
+    model_id = my_rank + 1
+    println("Model #$my_rank reporting in")
+
+    return DistributedModel(ranks, nothing, nothing, comm)
+end
+
+dm = DistributedModel(ranks=(2, 2, 1))

From e16190e7953ece023793d6118665d7e9a728359b Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Sat, 14 Dec 2019 23:03:46 -0500
Subject: [PATCH 018/100] Utility functions for converting between MPI rank and 3D index.
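In this version ranks are 0-based with i as the fast index
(rank = k*Rx*Ry + j*Rx + i); the next commit switches to 1-based indices
with z fastest. Either way the two maps must round-trip, which is easy to
spot-check (a sketch, not part of the patch):

    Rx, Ry, Rz = 2, 2, 2
    for r in 0:Rx*Ry*Rz-1
        i, j, k = rank2index(r, Rx, Ry, Rz)
        @assert index2rank(i, j, k, Rx, Ry, Rz) == r
    end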
--- src/distributed_model.jl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 6272e23d85..b49ab8fbb1 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -9,7 +9,17 @@ struct DistributedModel{A, R, G, C} MPI_Comm :: C end -const FieldBoundaryConditions = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} +const RankConnectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} + +@inline index2rank(i, j, k, Rx, Ry, Rz) = k*Rx*Ry + j*Rx + i + +@inline function rank2index(r, Rx, Ry, Rz) + k = div(r, Rx*Ry) + r -= k*Rx*Ry + j = div(r, Rx) + i = mod(r, Rx) + return i, j, k +end function validate_tupled_argument(arg, argtype, argname) length(arg) == 3 || throw(ArgumentError("length($argname) must be 3.")) From aff034bd8ffa45c372ec8df903f2eefbad386a2b Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 14 Dec 2019 23:06:41 -0500 Subject: [PATCH 019/100] Make z the fast index and convert to 1-based indexing --- src/distributed_model.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index b49ab8fbb1..cc407aece8 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -11,14 +11,14 @@ end const RankConnectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} -@inline index2rank(i, j, k, Rx, Ry, Rz) = k*Rx*Ry + j*Rx + i +@inline index2rank(i, j, k, Rx, Ry, Rz) = (i-1)*Ry*Rz + (j-1)*Rz + (k-1) @inline function rank2index(r, Rx, Ry, Rz) - k = div(r, Rx*Ry) - r -= k*Rx*Ry - j = div(r, Rx) - i = mod(r, Rx) - return i, j, k + i = div(r, Ry*Rz) + r -= i*Ry*Rz + j = div(r, Rz) + k = mod(r, Rz) + return i+1, j+1, k+1 end function validate_tupled_argument(arg, argtype, argname) From b7897225466f6d0f74e39b3d912b15da9a8b647b Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 14 Dec 2019 23:33:57 -0500 Subject: [PATCH 020/100] Add size and length kwargs, and create proper grid for each rank --- src/distributed_model.jl | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index cc407aece8..005588d539 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -2,6 +2,8 @@ import MPI using Oceananigans +using Oceananigans.Grids: validate_tupled_argument + struct DistributedModel{A, R, G, C} ranks :: R models :: A @@ -21,15 +23,14 @@ const RankConnectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom return i+1, j+1, k+1 end -function validate_tupled_argument(arg, argtype, argname) - length(arg) == 3 || throw(ArgumentError("length($argname) must be 3.")) - all(isa.(arg, argtype)) || throw(ArgumentError("$argname=$arg must contain $argtype s.")) - all(arg .> 0) || throw(ArgumentError("Elements of $argname=$arg must be > 0!")) - return nothing -end - -function DistributedModel(; ranks, model_kwargs...) +function DistributedModel(; size, length, ranks, model_kwargs...) + validate_tupled_argument(ranks, Int, "size") + validate_tupled_argument(ranks, Number, "length") validate_tupled_argument(ranks, Int, "ranks") + + Nx, Ny, Nz = size + Lx, Ly, Lz = length + Rx, Ry, Rz = ranks total_ranks = Rx*Ry*Rz @@ -52,9 +53,22 @@ function DistributedModel(; ranks, model_kwargs...) 
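    # Annotation, not part of the patch: the additions below carve the global
    # domain into per-rank sub-grids. Along x, rank index i (1-based) gets
    #     nx = Nx ÷ Rx,  lx = Lx / Rx,  x₁ = (i-1)*lx,  x₂ = i*lx,
    # assuming Rx evenly divides Nx. With the test call here (Nx = 32, Lx = 1,
    # Rx = 2), rank i = 1 owns x ∈ [0, 0.5] and rank i = 2 owns x ∈ [0.5, 1].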
MPI.Barrier(comm) model_id = my_rank + 1 - println("Model #$my_rank reporting in") + index = rank2index(my_rank, Rx, Ry, Rz) + rr = index2rank(index..., Rx, Ry, Rz) + # @info "rank=$my_rank, index=$index, index2rank=$rr" + + i, j, k = rank2index(my_rank, Rx, Ry, Rz) + nx, ny, nz = Nx÷Rx, Ny÷Ry, Nz÷Rz + lx, ly, lz = Lx/Rx, Ly/Ry, Lz/Rz + x₁, x₂ = (i-1)*lx, i*lx + y₁, y₂ = (j-1)*ly, j*ly + z₁, z₂ = (k-1)*lz, k*lz + @info "rank=$my_rank, x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" + grid = RegularCartesianGrid(size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) return DistributedModel(ranks, nothing, nothing, comm) end -dm = DistributedModel(ranks=(2, 2, 1)) +dm = DistributedModel(size=(32, 32, 32), length=(1, 2, 5), ranks=(2, 2, 2)) + +MPI.Finalize() From 2540596d4f728164c42e61fa8842f0a9f2703204 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 14 Dec 2019 23:42:13 -0500 Subject: [PATCH 021/100] More general left/right endpoints for grid --- src/distributed_model.jl | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 005588d539..390a16d8df 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -23,13 +23,16 @@ const RankConnectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom return i+1, j+1, k+1 end -function DistributedModel(; size, length, ranks, model_kwargs...) +function DistributedModel(; size, x, y, z, ranks, model_kwargs...) validate_tupled_argument(ranks, Int, "size") - validate_tupled_argument(ranks, Number, "length") validate_tupled_argument(ranks, Int, "ranks") Nx, Ny, Nz = size - Lx, Ly, Lz = length + + xL, xR = x + yL, yR = y + zL, zR = z + Lx, Ly, Lz = xR-xL, yR-yL, zR-zL Rx, Ry, Rz = ranks total_ranks = Rx*Ry*Rz @@ -55,20 +58,26 @@ function DistributedModel(; size, length, ranks, model_kwargs...) model_id = my_rank + 1 index = rank2index(my_rank, Rx, Ry, Rz) rr = index2rank(index..., Rx, Ry, Rz) - # @info "rank=$my_rank, index=$index, index2rank=$rr" + @info "rank=$my_rank, index=$index, index2rank=$rr" + + MPI.Barrier(comm) i, j, k = rank2index(my_rank, Rx, Ry, Rz) nx, ny, nz = Nx÷Rx, Ny÷Ry, Nz÷Rz lx, ly, lz = Lx/Rx, Ly/Ry, Lz/Rz - x₁, x₂ = (i-1)*lx, i*lx - y₁, y₂ = (j-1)*ly, j*ly - z₁, z₂ = (k-1)*lz, k*lz + + x₁, x₂ = xL + (i-1)*lx, xL + i*lx + y₁, y₂ = yL + (j-1)*ly, yL + j*ly + z₁, z₂ = zL + (k-1)*lz, zL + k*lz + @info "rank=$my_rank, x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" grid = RegularCartesianGrid(size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) + MPI.Barrier(comm) + return DistributedModel(ranks, nothing, nothing, comm) end -dm = DistributedModel(size=(32, 32, 32), length=(1, 2, 5), ranks=(2, 2, 2)) +dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0)) MPI.Finalize() From d446d12f20e6c524bb53b079200f91b2d99acffd Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 00:06:01 -0500 Subject: [PATCH 022/100] Compute connectivity graph assuming box model and brick MPI topology The connectivity graph needs to take into account periodic boundary conditions. --- src/distributed_model.jl | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 390a16d8df..7fc476881f 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -75,6 +75,33 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) MPI.Barrier(comm) + i_east = i+1 > Rx ? 
nothing : i+1 + i_west = i-1 < 1 ? nothing : i-1 + + j_north = j+1 > Ry ? nothing : j+1 + j_south = j-1 < 1 ? nothing : j-1 + + k_top = k+1 > Rz ? nothing : k+1 + k_bot = k-1 < 1 ? nothing : k-1 + + r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) + r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) + + r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) + r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz) + + r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) + r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) + + @info "rank=$my_rank, index=$index, i_east=$i_east, i_west=$i_west, j_north=$j_north, j_south=$j_south, k_top=$k_top, k_bot=$k_bot" + @info "rank=$my_rank, r_east=$r_east, r_west=$r_west, r_north=$r_north, r_south=$r_south, r_top=$r_top, r_bot=$r_bot" + + MPI.Barrier(comm) + + my_connectivity = (east=r_east, west=r_west, + north=r_north, south=r_south, + top=r_top, bottom=r_bot) + return DistributedModel(ranks, nothing, nothing, comm) end From b56b13c048834e5ca493b1617804c2286cecce89 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 00:39:01 -0500 Subject: [PATCH 023/100] Communication boundary condition type for MPI --- src/distributed_model.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 7fc476881f..3f10a25e07 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -2,6 +2,7 @@ import MPI using Oceananigans +using Oceananigans: BCType using Oceananigans.Grids: validate_tupled_argument struct DistributedModel{A, R, G, C} @@ -11,6 +12,8 @@ struct DistributedModel{A, R, G, C} MPI_Comm :: C end +struct Communication <: BCType end + const RankConnectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} @inline index2rank(i, j, k, Rx, Ry, Rz) = (i-1)*Ry*Rz + (j-1)*Rz + (k-1) From 117853dbce398ef48522e716378ddb50245561ac Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 00:40:56 -0500 Subject: [PATCH 024/100] Root process should gather connectivities from each rank --- src/distributed_model.jl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 3f10a25e07..d5b4608496 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -99,15 +99,21 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) 
@info "rank=$my_rank, index=$index, i_east=$i_east, i_west=$i_west, j_north=$j_north, j_south=$j_south, k_top=$k_top, k_bot=$k_bot" @info "rank=$my_rank, r_east=$r_east, r_west=$r_west, r_north=$r_north, r_south=$r_south, r_top=$r_top, r_bot=$r_bot" - MPI.Barrier(comm) - my_connectivity = (east=r_east, west=r_west, north=r_north, south=r_south, top=r_top, bottom=r_bot) - return DistributedModel(ranks, nothing, nothing, comm) + MPI.Barrier(comm) + + connectivity_graph = MPI.Gather(my_connectivity, 0, comm) + + dm = DistributedModel(ranks, nothing, connectivity_graph, comm) + + return dm end dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0)) -MPI.Finalize() +for r in dm.connectivity_graph + @show r +end From 89e8c5bd1c799039e96e247c2baa6ccecef693bf Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 00:43:36 -0500 Subject: [PATCH 025/100] We can MPI.Finalize in a finalizer for `DistributedModel` --- src/distributed_model.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index d5b4608496..b3a80517af 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -5,7 +5,7 @@ using Oceananigans using Oceananigans: BCType using Oceananigans.Grids: validate_tupled_argument -struct DistributedModel{A, R, G, C} +mutable struct DistributedModel{A, R, G, C} ranks :: R models :: A connectivity_graph :: G @@ -105,15 +105,17 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) MPI.Barrier(comm) - connectivity_graph = MPI.Gather(my_connectivity, 0, comm) - + connectivity_graph = MPI.Gather([0, 1, 2], 0, comm) + dm = DistributedModel(ranks, nothing, connectivity_graph, comm) + finalizer(x -> MPI.Finalize(), dm) + return dm end dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0)) -for r in dm.connectivity_graph - @show r -end +# for r in dm.connectivity_graph +# @show r +# end From 41346ce49c1e3cd3636e074a666952e124879640 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 00:58:14 -0500 Subject: [PATCH 026/100] Gotta be careful and run the rest of the script as rank 0 --- src/distributed_model.jl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index b3a80517af..cfc50fac61 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -107,15 +107,17 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) 
connectivity_graph = MPI.Gather([0, 1, 2], 0, comm) - dm = DistributedModel(ranks, nothing, connectivity_graph, comm) - - finalizer(x -> MPI.Finalize(), dm) - - return dm + if my_rank == 0 + dm = DistributedModel(ranks, nothing, connectivity_graph, comm) + finalizer(x -> MPI.Finalize(), dm) + return dm + end end dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0)) -# for r in dm.connectivity_graph -# @show r -# end +if MPI.Comm_rank(MPI.COMM_WORLD) == 0 + for r in dm.connectivity_graph + @show r + end +end From 724d2e02f45dfd86920cddd8678c32d349500f75 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 14:47:45 -0500 Subject: [PATCH 027/100] Finalizer could be a bad idea actually --- src/distributed_model.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index cfc50fac61..173a81b299 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -108,9 +108,7 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) connectivity_graph = MPI.Gather([0, 1, 2], 0, comm) if my_rank == 0 - dm = DistributedModel(ranks, nothing, connectivity_graph, comm) - finalizer(x -> MPI.Finalize(), dm) - return dm + return DistributedModel(ranks, nothing, connectivity_graph, comm) end end From c45cc4c19939d891fd0839bace3652c795e0bf64 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 15:06:04 -0500 Subject: [PATCH 028/100] Send and received connectivities as named tuples --- src/distributed_model.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 173a81b299..28e2357449 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -103,11 +103,16 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) north=r_north, south=r_south, top=r_top, bottom=r_bot) - MPI.Barrier(comm) + MPI.send(my_connectivity, 0, my_rank, comm) - connectivity_graph = MPI.Gather([0, 1, 2], 0, comm) + MPI.Barrier(comm) if my_rank == 0 + connectivity_graph = Array{RankConnectivity}(undef, mpi_ranks) + for r in 0:mpi_ranks-1 + connectivity_graph[r+1], _ = MPI.recv(r, r, comm) + end + return DistributedModel(ranks, nothing, connectivity_graph, comm) end end From e594c29b38619fa948f86a2052271163130b21ec Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 16:54:24 -0500 Subject: [PATCH 029/100] No master/root model struct. Every rank stores their own model --- src/distributed_model.jl | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 28e2357449..0b484c22c8 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -5,11 +5,10 @@ using Oceananigans using Oceananigans: BCType using Oceananigans.Grids: validate_tupled_argument -mutable struct DistributedModel{A, R, G, C} +struct DistributedModel{A, R, G} ranks :: R - models :: A - connectivity_graph :: G - MPI_Comm :: C + model :: A + connectivity :: G end struct Communication <: BCType end @@ -40,7 +39,6 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) Rx, Ry, Rz = ranks total_ranks = Rx*Ry*Rz - MPI.Init() comm = MPI.COMM_WORLD mpi_ranks = MPI.Comm_size(comm) @@ -103,24 +101,20 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) 
north=r_north, south=r_south, top=r_top, bottom=r_bot) - MPI.send(my_connectivity, 0, my_rank, comm) - MPI.Barrier(comm) - if my_rank == 0 - connectivity_graph = Array{RankConnectivity}(undef, mpi_ranks) - for r in 0:mpi_ranks-1 - connectivity_graph[r+1], _ = MPI.recv(r, r, comm) - end + @info "Rank $my_rank creating my model..." + my_model = Model(grid=grid) + @info "Rank $my_rank: submodel created!" - return DistributedModel(ranks, nothing, connectivity_graph, comm) - end + return DistributedModel(ranks, my_model, my_connectivity) end +MPI.Init() + dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0)) -if MPI.Comm_rank(MPI.COMM_WORLD) == 0 - for r in dm.connectivity_graph - @show r - end -end +my_rank = MPI.Comm_rank(MPI.COMM_WORLD) +@info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" + +MPI.Finalize() From cd03e13fc8adeef22a715c7225e7db57e15651fa Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 17:13:53 -0500 Subject: [PATCH 030/100] Cleanup --- src/distributed_model.jl | 87 ++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 0b484c22c8..3bb166c6b0 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -1,19 +1,14 @@ +using Test + import MPI using Oceananigans - using Oceananigans: BCType using Oceananigans.Grids: validate_tupled_argument -struct DistributedModel{A, R, G} - ranks :: R - model :: A - connectivity :: G -end - -struct Communication <: BCType end - -const RankConnectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} +##### +##### Converting between index and MPI rank taking k as the fast index +##### @inline index2rank(i, j, k, Rx, Ry, Rz) = (i-1)*Ry*Rz + (j-1)*Rz + (k-1) @@ -25,6 +20,32 @@ const RankConnectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom return i+1, j+1, k+1 end +##### +##### Communication boundary condition +##### + +struct Communication <: BCType end + +##### +##### Distributed model struct and constructor +##### + +const Connectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} + +struct DistributedModel{A, R, G} + ranks :: R + model :: A + connectivity :: G +end + +""" + DistributedModel(size, x, y, z, ranks, model_kwargs...) + +size: Number of total grid points. +x, y, z: Left and right endpoints for each dimension. +ranks: Number of ranks in each dimension. +model_kwargs: Passed to `Model` constructor. +""" function DistributedModel(; size, x, y, z, ranks, model_kwargs...) validate_tupled_argument(ranks, Int, "size") validate_tupled_argument(ranks, Int, "ranks") @@ -44,26 +65,19 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) mpi_ranks = MPI.Comm_size(comm) my_rank = MPI.Comm_rank(comm) - if my_rank == 0 - if total_ranks != mpi_ranks - throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " * - "with number of MPI ranks: $mpi_ranks. Exiting with code 1.")) - MPI.Finalize() - exit(code=1) - end + if total_ranks != mpi_ranks + throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " * + "with number of MPI ranks: $mpi_ranks. Exiting with code 1.")) + MPI.Finalize() + exit(code=1) end - # Ensure that ranks 1:N don't go ahead if total_ranks != mpi_ranks. 
- MPI.Barrier(comm) + i, j, k = index = rank2index(my_rank, Rx, Ry, Rz) - model_id = my_rank + 1 - index = rank2index(my_rank, Rx, Ry, Rz) - rr = index2rank(index..., Rx, Ry, Rz) - @info "rank=$my_rank, index=$index, index2rank=$rr" + ##### + ##### Construct local grid + ##### - MPI.Barrier(comm) - - i, j, k = rank2index(my_rank, Rx, Ry, Rz) nx, ny, nz = Nx÷Rx, Ny÷Ry, Nz÷Rz lx, ly, lz = Lx/Rx, Ly/Ry, Lz/Rz @@ -71,10 +85,11 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) y₁, y₂ = yL + (j-1)*ly, yL + j*ly z₁, z₂ = zL + (k-1)*lz, zL + k*lz - @info "rank=$my_rank, x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" grid = RegularCartesianGrid(size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) - MPI.Barrier(comm) + ##### + ##### Construct local connectivity + ##### i_east = i+1 > Rx ? nothing : i+1 i_west = i-1 < 1 ? nothing : i-1 @@ -94,14 +109,12 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) - @info "rank=$my_rank, index=$index, i_east=$i_east, i_west=$i_west, j_north=$j_north, j_south=$j_south, k_top=$k_top, k_bot=$k_bot" - @info "rank=$my_rank, r_east=$r_east, r_west=$r_west, r_north=$r_north, r_south=$r_south, r_top=$r_top, r_bot=$r_bot" - - my_connectivity = (east=r_east, west=r_west, - north=r_north, south=r_south, - top=r_top, bottom=r_bot) + my_connectivity = (east=r_east, west=r_west, north=r_north, + south=r_south, top=r_top, bottom=r_bot) - MPI.Barrier(comm) + ##### + ##### Construct local model + ##### @info "Rank $my_rank creating my model..." my_model = Model(grid=grid) @@ -110,6 +123,10 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) return DistributedModel(ranks, my_model, my_connectivity) end +##### +##### Script/test/whatever +##### + MPI.Init() dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0)) From f37d607bf0926729e39c77f1c1fd8c606fda0d0d Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 17:28:05 -0500 Subject: [PATCH 031/100] Isolate construction of connectivity graph --- src/distributed_model.jl | 66 +++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 3bb166c6b0..b5eff1ca6a 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -20,6 +20,38 @@ using Oceananigans.Grids: validate_tupled_argument return i+1, j+1, k+1 end +##### +##### Connectivity graph +##### + +const Connectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} + +function construct_connectivity(index, ranks, boundary_conditions) + i, j, k = index + Rx, Ry, Rz = ranks + + i_east = i+1 > Rx ? nothing : i+1 + i_west = i-1 < 1 ? nothing : i-1 + + j_north = j+1 > Ry ? nothing : j+1 + j_south = j-1 < 1 ? nothing : j-1 + + k_top = k+1 > Rz ? nothing : k+1 + k_bot = k-1 < 1 ? nothing : k-1 + + r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) + r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) + + r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) + r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz) + + r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) + r_bot = isnothing(k_bot) ? 
nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) + + return (east=r_east, west=r_west, north=r_north, + south=r_south, top=r_top, bottom=r_bot) +end + ##### ##### Communication boundary condition ##### @@ -30,8 +62,6 @@ struct Communication <: BCType end ##### Distributed model struct and constructor ##### -const Connectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} - struct DistributedModel{A, R, G} ranks :: R model :: A @@ -46,12 +76,13 @@ x, y, z: Left and right endpoints for each dimension. ranks: Number of ranks in each dimension. model_kwargs: Passed to `Model` constructor. """ -function DistributedModel(; size, x, y, z, ranks, model_kwargs...) +function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwargs...) validate_tupled_argument(ranks, Int, "size") validate_tupled_argument(ranks, Int, "ranks") Nx, Ny, Nz = size + # Pull out left and right endpoints for full model. xL, xR = x yL, yR = y zL, zR = z @@ -73,6 +104,7 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) end i, j, k = index = rank2index(my_rank, Rx, Ry, Rz) + @debug "Rank: $my_rank, index: $index" ##### ##### Construct local grid @@ -85,40 +117,22 @@ function DistributedModel(; size, x, y, z, ranks, model_kwargs...) y₁, y₂ = yL + (j-1)*ly, yL + j*ly z₁, z₂ = zL + (k-1)*lz, zL + k*lz + @debug "Constructing local grid: n=($nx, $ny, $nz), x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" grid = RegularCartesianGrid(size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) ##### ##### Construct local connectivity ##### - i_east = i+1 > Rx ? nothing : i+1 - i_west = i-1 < 1 ? nothing : i-1 - - j_north = j+1 > Ry ? nothing : j+1 - j_south = j-1 < 1 ? nothing : j-1 - - k_top = k+1 > Rz ? nothing : k+1 - k_bot = k-1 < 1 ? nothing : k-1 - - r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) - r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) - - r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) - r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz) - - r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) - r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) + my_connectivity = construct_connectivity(index, ranks, boundary_conditions) - my_connectivity = (east=r_east, west=r_west, north=r_north, - south=r_south, top=r_top, bottom=r_bot) + @debug "Local connectivity: $my_connectivity" ##### ##### Construct local model ##### - @info "Rank $my_rank creating my model..." my_model = Model(grid=grid) - @info "Rank $my_rank: submodel created!" 
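    # A usage note (a sketch; exact launcher flags vary by MPI distribution):
    # the constructor requires total_ranks == mpi_ranks, so a script calling it
    # is assumed to be launched with exactly Rx*Ry*Rz processes, e.g. for
    # ranks=(2, 2, 2):
    #
    #   mpiexec -n 8 julia distributed_model.jl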
return DistributedModel(ranks, my_model, my_connectivity) end @@ -129,7 +143,9 @@ end MPI.Init() -dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0)) +dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), + x=(0, 1), y=(-0.5, 0.5), z=(-10, 0), + boundary_conditions=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" From 2d8cb4052f6e8862336257026ffc31b141f54b56 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 15 Dec 2019 18:14:41 -0500 Subject: [PATCH 032/100] Account for periodic boundary conditions when constructing connectivity --- src/distributed_model.jl | 60 +++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index b5eff1ca6a..421c108e0e 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -3,9 +3,15 @@ using Test import MPI using Oceananigans -using Oceananigans: BCType +using Oceananigans: BCType, PBC using Oceananigans.Grids: validate_tupled_argument +##### +##### Convinient aliases +##### + +const PeriodicBC = PBC + ##### ##### Converting between index and MPI rank taking k as the fast index ##### @@ -26,27 +32,49 @@ end const Connectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} -function construct_connectivity(index, ranks, boundary_conditions) - i, j, k = index - Rx, Ry, Rz = ranks +function increment_index(i, R, bc) + if i+1 > R + if bc isa PeriodicBC + return 1 + else + return nothing + end + else + return i+1 + end +end - i_east = i+1 > Rx ? nothing : i+1 - i_west = i-1 < 1 ? nothing : i-1 +function decrement_index(i, R, bc) + if i-1 < 1 + if bc isa PeriodicBC + return R + else + return nothing + end + else + return i-1 + end +end - j_north = j+1 > Ry ? nothing : j+1 - j_south = j-1 < 1 ? nothing : j-1 +function construct_connectivity(index, ranks, bcs) + i, j, k = index + Rx, Ry, Rz = ranks - k_top = k+1 > Rz ? nothing : k+1 - k_bot = k-1 < 1 ? nothing : k-1 + @show Rx, Ry, Rz - r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) - r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) + i_east = increment_index(i, Rx, bcs.x.right) + i_west = decrement_index(i, Rx, bcs.x.left) + j_north = increment_index(j, Ry, bcs.y.north) + j_south = decrement_index(j, Ry, bcs.y.south) + k_top = increment_index(k, Rz, bcs.z.top) + k_bot = decrement_index(k, Rz, bcs.z.bottom) + r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) + r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz) - - r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) - r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) + r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) + r_bot = isnothing(k_bot) ? 
nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) return (east=r_east, west=r_west, north=r_north, south=r_south, top=r_top, bottom=r_bot) @@ -145,7 +173,7 @@ MPI.Init() dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0), - boundary_conditions=nothing) + boundary_conditions=HorizontallyPeriodicBCs()) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" From ad2955944ca4ba9dcdd5c5d67da7493a8681000e Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Mon, 16 Dec 2019 05:25:04 -0500 Subject: [PATCH 033/100] Need to know index --- src/distributed_model.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 421c108e0e..649f89635f 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -90,7 +90,8 @@ struct Communication <: BCType end ##### Distributed model struct and constructor ##### -struct DistributedModel{A, R, G} +struct DistributedModel{I, A, R, G} + index :: I ranks :: R model :: A connectivity :: G @@ -153,7 +154,6 @@ function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwa ##### my_connectivity = construct_connectivity(index, ranks, boundary_conditions) - @debug "Local connectivity: $my_connectivity" ##### @@ -162,7 +162,7 @@ function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwa my_model = Model(grid=grid) - return DistributedModel(ranks, my_model, my_connectivity) + return DistributedModel(index, ranks, my_model, my_connectivity) end ##### @@ -171,7 +171,7 @@ end MPI.Init() -dm = DistributedModel(ranks=(2, 2, 2), size=(32, 32, 32), +dm = DistributedModel(ranks=(2, 2, 2), size=(16, 16, 16), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0), boundary_conditions=HorizontallyPeriodicBCs()) From 54c18601a8d21f8c84a46dbb467c96816b375931 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 3 Jan 2020 12:33:23 -0500 Subject: [PATCH 034/100] Split test --- src/distributed_model.jl | 14 -------------- src/test_distributed_model.jl | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) create mode 100644 src/test_distributed_model.jl diff --git a/src/distributed_model.jl b/src/distributed_model.jl index 649f89635f..de1fe36b31 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -165,17 +165,3 @@ function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwa return DistributedModel(index, ranks, my_model, my_connectivity) end -##### -##### Script/test/whatever -##### - -MPI.Init() - -dm = DistributedModel(ranks=(2, 2, 2), size=(16, 16, 16), - x=(0, 1), y=(-0.5, 0.5), z=(-10, 0), - boundary_conditions=HorizontallyPeriodicBCs()) - -my_rank = MPI.Comm_rank(MPI.COMM_WORLD) -@info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" - -MPI.Finalize() diff --git a/src/test_distributed_model.jl b/src/test_distributed_model.jl new file mode 100644 index 0000000000..66df271ea7 --- /dev/null +++ b/src/test_distributed_model.jl @@ -0,0 +1,16 @@ +using Test + +import MPI + +include("distributed_model.jl") + +MPI.Init() + +dm = DistributedModel(ranks=(2, 2, 2), size=(16, 16, 16), + x=(0, 1), y=(-0.5, 0.5), z=(-10, 0), + boundary_conditions=HorizontallyPeriodicBCs()) + +my_rank = MPI.Comm_rank(MPI.COMM_WORLD) +@info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" + +MPI.Finalize() From ebbdc910e02169e08b6e770b3fb18b630982bf38 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 3 Jan 2020 15:57:10 -0500 
Subject: [PATCH 035/100] Inject halo communication boundary conditions But need to figure out what to do with pressure solver as it tries to set up an NNN Poisson solver on some processes... --- src/distributed_model.jl | 64 ++++++++++++++++++++++++++++++----- src/test_distributed_model.jl | 3 +- 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/src/distributed_model.jl b/src/distributed_model.jl index de1fe36b31..46807bff77 100644 --- a/src/distributed_model.jl +++ b/src/distributed_model.jl @@ -1,5 +1,3 @@ -using Test - import MPI using Oceananigans @@ -60,8 +58,6 @@ function construct_connectivity(index, ranks, bcs) i, j, k = index Rx, Ry, Rz = ranks - @show Rx, Ry, Rz - i_east = increment_index(i, Rx, bcs.x.right) i_west = decrement_index(i, Rx, bcs.x.left) j_north = increment_index(j, Ry, bcs.y.north) @@ -81,10 +77,54 @@ function construct_connectivity(index, ranks, bcs) end ##### -##### Communication boundary condition +##### Halo communication boundary condition ##### -struct Communication <: BCType end +struct HaloCommunication <: BCType end +const HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} + +const HaloCommunicationDetails = NamedTuple{(:rank_from, :rank_to)} +HaloCommunicationDetails(; rank_from, rank_to) = HaloCommunicationDetails((rank_from, rank_to)) + +function inject_halo_communication_boundary_conditions(boundary_conditions, my_rank, connectivity) + new_field_bcs = [] + + for field_bcs in boundary_conditions + rank_east = connectivity.east + rank_west = connectivity.west + rank_north = connectivity.north + rank_south = connectivity.south + rank_top = connectivity.top + rank_bottom = connectivity.bottom + + east_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.east) + west_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.west) + north_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.north) + south_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.south) + top_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.top) + bottom_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.bottom) + + east_comm_bc = BoundaryCondition(HaloCommunication, east_comm_bc_details) + west_comm_bc = BoundaryCondition(HaloCommunication, west_comm_bc_details) + north_comm_bc = BoundaryCondition(HaloCommunication, north_comm_bc_details) + south_comm_bc = BoundaryCondition(HaloCommunication, south_comm_bc_details) + top_comm_bc = BoundaryCondition(HaloCommunication, top_comm_bc_details) + bottom_comm_bc = BoundaryCondition(HaloCommunication, bottom_comm_bc_details) + + x_bcs = CoordinateBoundaryConditions(isnothing(rank_west) ? field_bcs.x.left : west_comm_bc, + isnothing(rank_east) ? field_bcs.x.right : east_comm_bc) + + y_bcs = CoordinateBoundaryConditions(isnothing(rank_south) ? field_bcs.y.south : south_comm_bc, + isnothing(rank_north) ? field_bcs.y.north : north_comm_bc) + + z_bcs = CoordinateBoundaryConditions(isnothing(rank_bottom) ? field_bcs.z.bottom : bottom_comm_bc, + isnothing(rank_top) ? 
field_bcs.z.top : top_comm_bc) + + push!(new_field_bcs, FieldBoundaryConditions(x_bcs, y_bcs, z_bcs)) + end + + return NamedTuple{propertynames(boundary_conditions)}(Tuple(new_field_bcs)) +end ##### ##### Distributed model struct and constructor @@ -153,15 +193,21 @@ function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwa ##### Construct local connectivity ##### - my_connectivity = construct_connectivity(index, ranks, boundary_conditions) + my_connectivity = construct_connectivity(index, ranks, boundary_conditions.u) @debug "Local connectivity: $my_connectivity" + ##### + ##### Change appropriate boundary conditions to halo communication BCs + ##### + + @debug "Injecting halo communication boundary conditions..." + boundary_conditions_with_communication = inject_halo_communication_boundary_conditions(boundary_conditions, my_rank, my_connectivity) + ##### ##### Construct local model ##### - my_model = Model(grid=grid) + my_model = Model(grid=grid, boundary_conditions=boundary_conditions_with_communication) return DistributedModel(index, ranks, my_model, my_connectivity) end - diff --git a/src/test_distributed_model.jl b/src/test_distributed_model.jl index 66df271ea7..680b546319 100644 --- a/src/test_distributed_model.jl +++ b/src/test_distributed_model.jl @@ -8,9 +8,10 @@ MPI.Init() dm = DistributedModel(ranks=(2, 2, 2), size=(16, 16, 16), x=(0, 1), y=(-0.5, 0.5), z=(-10, 0), - boundary_conditions=HorizontallyPeriodicBCs()) + boundary_conditions=HorizontallyPeriodicSolutionBCs()) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" +@info "u.x BCs: $(dm.model.boundary_conditions.solution.u.x)" MPI.Finalize() From bd7fd4ce8f5fb7a3974fdd9ef745bc0ad93c9f09 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 5 Jan 2020 08:58:27 -0500 Subject: [PATCH 036/100] Move to an `Oceananigans.Distributed` submodule. --- src/{ => Distributed}/distributed_model.jl | 0 src/{ => Distributed}/test_distributed_model.jl | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/{ => Distributed}/distributed_model.jl (100%) rename src/{ => Distributed}/test_distributed_model.jl (100%) diff --git a/src/distributed_model.jl b/src/Distributed/distributed_model.jl similarity index 100% rename from src/distributed_model.jl rename to src/Distributed/distributed_model.jl diff --git a/src/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl similarity index 100% rename from src/test_distributed_model.jl rename to src/Distributed/test_distributed_model.jl From e99f78f3b64edac3dbef8e4b3d4c9c27a3da24e6 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 5 Jan 2020 09:20:39 -0500 Subject: [PATCH 037/100] Properly pass `Model` kwargs. --- src/Distributed/distributed_model.jl | 3 ++- src/Distributed/test_distributed_model.jl | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 46807bff77..1be3e33e96 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -207,7 +207,8 @@ function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwa ##### Construct local model ##### - my_model = Model(grid=grid, boundary_conditions=boundary_conditions_with_communication) + my_model = Model(; grid = grid, boundary_conditions = boundary_conditions_with_communication, + model_kwargs...) 
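+    # (note the leading `;` and the kwargs splat: anything collected in
+    # model_kwargs, e.g. the poisson_solver passed in the test below, is
+    # forwarded untouched to the underlying Model constructor)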
return DistributedModel(index, ranks, my_model, my_connectivity)
end
diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl
index 680b546319..2a3b932457 100644
--- a/src/Distributed/test_distributed_model.jl
+++ b/src/Distributed/test_distributed_model.jl
@@ -6,9 +6,10 @@ include("distributed_model.jl")

MPI.Init()

-dm = DistributedModel(ranks=(2, 2, 2), size=(16, 16, 16),
-                      x=(0, 1), y=(-0.5, 0.5), z=(-10, 0),
-                      boundary_conditions=HorizontallyPeriodicSolutionBCs())
+dm = DistributedModel(ranks = (2, 2, 2), size = (16, 16, 16),
+                      x = (0, 1), y = (-0.5, 0.5), z = (-10, 0),
+                      boundary_conditions = HorizontallyPeriodicSolutionBCs(),
+                      poisson_solver = nothing)

my_rank = MPI.Comm_rank(MPI.COMM_WORLD)
@info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])"
@info "u.x BCs: $(dm.model.boundary_conditions.solution.u.x)"

From db394c05d4e6add4661c813e6cfc454af9d1e906 Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Sun, 5 Jan 2020 10:09:36 -0500
Subject: [PATCH 038/100] No need to `MPI.Finalize()` anymore

---
 src/Distributed/test_distributed_model.jl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl
index 2a3b932457..0d9a807a89 100644
--- a/src/Distributed/test_distributed_model.jl
+++ b/src/Distributed/test_distributed_model.jl
@@ -14,5 +14,3 @@ dm = DistributedModel(ranks = (2, 2, 2), size = (16, 16, 16),
my_rank = MPI.Comm_rank(MPI.COMM_WORLD)
@info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])"
@info "u.x BCs: $(dm.model.boundary_conditions.solution.u.x)"
-
-MPI.Finalize()

From cabf31b8123febfc029496620986a6967a9d1a3f Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Sun, 5 Jan 2020 11:00:32 -0500
Subject: [PATCH 039/100] Multiple dispatch to fill west halo with MPI 😍

---
 src/Distributed/distributed_model.jl      | 33 +++++++++++++++++++++++
 src/Distributed/test_distributed_model.jl |  9 +++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl
index 1be3e33e96..119f550c6c 100644
--- a/src/Distributed/distributed_model.jl
+++ b/src/Distributed/distributed_model.jl
@@ -4,6 +4,8 @@ using Oceananigans
using Oceananigans: BCType, PBC
using Oceananigans.Grids: validate_tupled_argument

+import Oceananigans: fill_west_halo!
+
#####
##### Convinient aliases
#####

@@ -126,6 +128,37 @@ function inject_halo_communication_boundary_conditions(boundary_conditions, my_r
    return NamedTuple{propertynames(boundary_conditions)}(Tuple(new_field_bcs))
end

+# Note: Hard-coded so this only works up to 10^3 ranks.
+@inline halo_comm_bc_send_tag(bc) = 10^3 * bc.condition.rank_from + bc.condition.rank_to
+@inline halo_comm_bc_recv_tag(bc) = 10^3 * bc.condition.rank_to + bc.condition.rank_from
+
+function fill_west_halo!(c, bc::HaloCommunicationBC, arch, grid, args...)
+    N, H = grid.Nx, grid.Hx
+
+    send_buffer = c.parent[N+1:N+H, :, :]
+    recv_buffer = c.parent[1:H, :, :]
+
+    dest_rank = bc.condition.rank_to
+    src_rank = bc.condition.rank_from
+
+    send_tag = halo_comm_bc_send_tag(bc)
+    recv_tag = halo_comm_bc_recv_tag(bc)
+
+    # @info "MPI.Isend: dest_rank=$dest_rank, send_tag=$send_tag"
+    # MPI.Isend(send_buffer, dest_rank, send_tag, MPI.COMM_WORLD)
+    # @info "MPI.Isend: done!"
+ # + # @info "MPI.Recv!: src_rank=$src_rank, recv_tag=$recv_tag" + # MPI.Recv!(recv_buffer, dest_rank, recv_tag, MPI.COMM_WORLD) + # @info "MPI.Recv! done!" + + @info "Sendrecv! src_rank=$src_rank, dest_rank=$dest_rank, send_tag=$send_tag, recv_tag=$recv_tag" + MPI.Sendrecv!(send_buffer, dest_rank, send_tag, + recv_buffer, dest_rank, recv_tag, + MPI.COMM_WORLD) + @info "Sendrecv! done!" +end + ##### ##### Distributed model struct and constructor ##### diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index 0d9a807a89..4bafebece9 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -2,6 +2,8 @@ using Test import MPI +using Oceananigans: interior, fill_halo_regions! + include("distributed_model.jl") MPI.Init() @@ -12,5 +14,8 @@ dm = DistributedModel(ranks = (2, 2, 2), size = (16, 16, 16), poisson_solver = nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) -@info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" -@info "u.x BCs: $(dm.model.boundary_conditions.solution.u.x)" +# @info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" +# @info "u.x BCs: $(dm.model.boundary_conditions.solution.u.x)" + +interior(dm.model.velocities.u) .= rand(8, 8, 8) +fill_halo_regions!(dm.model.velocities.u.data, dm.model.boundary_conditions.solution.u, CPU(), dm.model.grid) From 35bb6745c5f1b909c43e247750daaf32aed8c743 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 5 Jan 2020 11:50:31 -0500 Subject: [PATCH 040/100] Actually fill in the west halo. Also start to generalize. --- src/Distributed/distributed_model.jl | 46 ++++++++++++----------- src/Distributed/test_distributed_model.jl | 2 + 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 119f550c6c..0c862240b7 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -128,35 +128,37 @@ function inject_halo_communication_boundary_conditions(boundary_conditions, my_r return NamedTuple{propertynames(boundary_conditions)}(Tuple(new_field_bcs)) end -# Note: Hard-coded so this only works up to 10^3 ranks. -@inline halo_comm_bc_send_tag(bc) = 10^3 * bc.condition.rank_from + bc.condition.rank_to -@inline halo_comm_bc_recv_tag(bc) = 10^3 * bc.condition.rank_to + bc.condition.rank_from +# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). +const MAX_RANKS = 10^3 -function fill_west_halo!(c, bc::HaloCommunicationBC, arch, grid, args...) - N, H = grid.Nx, grid.Hx +sides = (:west, :east, :south, :north, :top, :bottom) +coords = (:x, :x, :y, :y, :z, :z) + +@inline west_halo_comm_bc_send_tag(bc) = 6 * (MAX_RANKS * bc.condition.rank_from + bc.condition.rank_to) +@inline west_halo_comm_bc_recv_tag(bc) = 6 * (MAX_RANKS * bc.condition.rank_to + bc.condition.rank_from) + +@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] - send_buffer = c.parent[N+1:N+H, :, :] - recv_buffer = c.parent[1:H, :, :] +@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ty, grid.Tz) - dest_rank = bc.condition.rank_to - src_rank = bc.condition.rank_from +const east_recv_buffer = west_recv_buffer - send_tag = halo_comm_bc_send_tag(bc) - recv_tag = halo_comm_bc_recv_tag(bc) +function fill_west_halo!(c, bc::HaloCommunicationBC, arch, grid, args...) 
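+    # A sketch of the exchange below: ship send_buffer (interior slabs) to the
+    # neighboring rank and receive that rank's matching slabs into recv_buffer,
+    # which is then copied into this rank's west halo.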
+ send_buffer = west_send_buffer(c, grid.Nx, grid.Hx) + recv_buffer = west_recv_buffer(grid) + + send_tag = west_halo_comm_bc_send_tag(bc) + recv_tag = west_halo_comm_bc_recv_tag(bc) - # @info "MPI.Isend: dest_rank=$dest_rank, send_tag=$send_tag" - # MPI.Isend(send_buffer, dest_rank, send_tag, MPI.COMM_WORLD) - # @info "MPI.Isend: done!" - # - # @info "MPI.Recv!: src_rank=$src_rank, recv_tag=$recv_tag" - # MPI.Recv!(recv_buffer, dest_rank, recv_tag, MPI.COMM_WORLD) - # @info "MPI.Recv! done!" + rank_send_to = rank_recv_from = bc.condition.rank_to - @info "Sendrecv! src_rank=$src_rank, dest_rank=$dest_rank, send_tag=$send_tag, recv_tag=$recv_tag" - MPI.Sendrecv!(send_buffer, dest_rank, send_tag, - recv_buffer, dest_rank, recv_tag, + @info "Sendrecv!: rank_send_to=rank_recv_from=$rank_send_to, send_tag=$send_tag, recv_tag=$recv_tag" + MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, + recv_buffer, rank_recv_from, recv_tag, MPI.COMM_WORLD) - @info "Sendrecv! done!" + @info "Sendrecv!: done!" + + c.parent[1:grid.Hx, :, :] .= recv_buffer end ##### diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index 4bafebece9..9790dbd705 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -19,3 +19,5 @@ my_rank = MPI.Comm_rank(MPI.COMM_WORLD) interior(dm.model.velocities.u) .= rand(8, 8, 8) fill_halo_regions!(dm.model.velocities.u.data, dm.model.boundary_conditions.solution.u, CPU(), dm.model.grid) + +display(interior(dm.model.velocities.u)) From 45666369d3ea4f5c864d8d662a5f445b22f3587a Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 5 Jan 2020 12:39:52 -0500 Subject: [PATCH 041/100] Oceananigans.Distributed: use macros to fill each halo. --- src/Distributed/distributed_model.jl | 87 +++++++++++++++++++++------- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 0c862240b7..2b54a591f8 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -4,7 +4,9 @@ using Oceananigans using Oceananigans: BCType, PBC using Oceananigans.Grids: validate_tupled_argument -import Oceananigans: fill_west_halo! +import Oceananigans: fill_west_halo!, fill_east_halo!, + fill_south_halo!, fill_north_halo!, + fill_bottom_halo!, fill_top_halo! ##### ##### Convinient aliases @@ -128,37 +130,82 @@ function inject_halo_communication_boundary_conditions(boundary_conditions, my_r return NamedTuple{propertynames(boundary_conditions)}(Tuple(new_field_bcs)) end -# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). -const MAX_RANKS = 10^3 +##### +##### Filling halos for halo communication boundary conditions +##### sides = (:west, :east, :south, :north, :top, :bottom) coords = (:x, :x, :y, :y, :z, :z) -@inline west_halo_comm_bc_send_tag(bc) = 6 * (MAX_RANKS * bc.condition.rank_from + bc.condition.rank_to) -@inline west_halo_comm_bc_recv_tag(bc) = 6 * (MAX_RANKS * bc.condition.rank_to + bc.condition.rank_from) +# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). +const MAX_RANKS = 10^3 + +# Define functions that return unique send and recv MPI tags for each side. 
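+# For example, with MAX_RANKS = 10^3 the west-side exchange (i = 1) from
+# rank 3 to rank 7 gets send_tag = 6 * (1000*3 + 7) + 1 = 18043 and
+# recv_tag = 6 * (1000*7 + 3) + 1 = 42019, so distinct (side, from, to)
+# triples always map to distinct tags.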
+for (i, side) in enumerate(sides) + send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) + recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) + @eval begin + @inline $send_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_from + bc.condition.rank_to) + $i + @inline $recv_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_to + bc.condition.rank_from) + $i + end +end + +@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] +@inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :] +@inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :] +@inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :] +@inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H] +@inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H] + +@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ty, grid.Tz) +@inline south_recv_buffer(grid) = zeros(grid.Tx, grid.Hy, grid.Tz) +@inline top_recv_buffer(grid) = zeros(grid.Tx, grid.Ty, grid.Hz) + +const east_recv_buffer = west_recv_buffer +const north_recv_buffer = south_recv_buffer +const bottom_recv_buffer = top_recv_buffer -@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] +@inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf) +@inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf) +@inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf) +@inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf) +@inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf) +@inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf) -@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ty, grid.Tz) +for (x, side) in zip(coords, sides) + H = Symbol(:H, x) + N = Symbol(:N, x) -const east_recv_buffer = west_recv_buffer + fill_fn_name = Symbol(:fill_, side, :_halo!) + send_buf_fn_name = Symbol(side, :_send_buffer) + recv_buf_fn_name = Symbol(side, :_recv_buffer) + send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) + recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) + copy_buf_fn_name = Symbol(:copy_recv_buffer_into_, side, :_halo!) -function fill_west_halo!(c, bc::HaloCommunicationBC, arch, grid, args...) - send_buffer = west_send_buffer(c, grid.Nx, grid.Hx) - recv_buffer = west_recv_buffer(grid) + @eval begin + function $fill_fn_name(c, bc::HaloCommunicationBC, arch, grid, args...) + send_buffer = $send_buf_fn_name(c, grid.$(N), grid.$(H)) + recv_buffer = $recv_buf_fn_name(grid) - send_tag = west_halo_comm_bc_send_tag(bc) - recv_tag = west_halo_comm_bc_recv_tag(bc) + send_tag = $send_tag_fn_name(bc) + recv_tag = $recv_tag_fn_name(bc) - rank_send_to = rank_recv_from = bc.condition.rank_to + my_rank = bc.condition.rank_from + rank_send_to = rank_recv_from = bc.condition.rank_to - @info "Sendrecv!: rank_send_to=rank_recv_from=$rank_send_to, send_tag=$send_tag, recv_tag=$recv_tag" - MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, - recv_buffer, rank_recv_from, recv_tag, - MPI.COMM_WORLD) - @info "Sendrecv!: done!" + @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * + "send_tag=$send_tag, recv_tag=$recv_tag" - c.parent[1:grid.Hx, :, :] .= recv_buffer + MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, + recv_buffer, rank_recv_from, recv_tag, + MPI.COMM_WORLD) + + @info "Sendrecv!: my_rank=$my_rank done!" 
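+            # copy the just-received buffer into this side's halo region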
+ + $copy_buf_fn_name(c, grid.$(N), grid.$(H), recv_buffer) + end + end end ##### From 227f2be16dab1635d5266d257e4ea81c4b925a83 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sun, 5 Jan 2020 13:16:59 -0500 Subject: [PATCH 042/100] Hmmm, halo communication seems to deadlock :( --- src/Distributed/distributed_model.jl | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 2b54a591f8..c1b46c81ae 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -194,14 +194,22 @@ for (x, side) in zip(coords, sides) my_rank = bc.condition.rank_from rank_send_to = rank_recv_from = bc.condition.rank_to - @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * - "send_tag=$send_tag, recv_tag=$recv_tag" - - MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, - recv_buffer, rank_recv_from, recv_tag, - MPI.COMM_WORLD) - - @info "Sendrecv!: my_rank=$my_rank done!" + @info "MPI.Isend: my_rank=$my_rank, rank_send_to=$rank_send_to, send_tag=$send_tag" + MPI.Isend(send_buffer, rank_send_to, send_tag, MPI.COMM_WORLD) + @info "MPI.Isend: done!" + + @info "MPI.Recv!: my_rank=$my_rank, rank_recv_from=$rank_recv_from, recv_tag=$recv_tag" + MPI.Recv!(recv_buffer, rank_recv_from, recv_tag, MPI.COMM_WORLD) + @info "MPI.Recv! done!" + + # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * + # "send_tag=$send_tag, recv_tag=$recv_tag" + # + # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, + # recv_buffer, rank_recv_from, recv_tag, + # MPI.COMM_WORLD) + # + # @info "Sendrecv!: my_rank=$my_rank done!" $copy_buf_fn_name(c, grid.$(N), grid.$(H), recv_buffer) end From 5cdeac3566fbef799912ba2e9a87ed379292e406 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 09:54:07 -0500 Subject: [PATCH 043/100] Modernize `distributed_model.jl` --- src/Distributed/distributed_model.jl | 335 ++++++++++++++------------- 1 file changed, 176 insertions(+), 159 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index c1b46c81ae..777571bf9b 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -1,18 +1,14 @@ import MPI using Oceananigans -using Oceananigans: BCType, PBC -using Oceananigans.Grids: validate_tupled_argument - -import Oceananigans: fill_west_halo!, fill_east_halo!, - fill_south_halo!, fill_north_halo!, - fill_bottom_halo!, fill_top_halo! +using Oceananigans.Grids -##### -##### Convinient aliases -##### +using Oceananigans.Grids: validate_tupled_argument +using Oceananigans.BoundaryConditions: BCType -const PeriodicBC = PBC +import Oceananigans.BoundaryConditions: + fill_west_halo!, fill_east_halo!, fill_south_halo!, + fill_north_halo!, fill_bottom_halo!, fill_top_halo! 
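+
+# Extending these fill_*_halo! methods is what lets MPI halo exchange plug
+# into fill_halo_regions! purely through dispatch on the boundary condition
+# type.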
##### ##### Converting between index and MPI rank taking k as the fast index @@ -29,14 +25,27 @@ const PeriodicBC = PBC end ##### -##### Connectivity graph +##### Rank connectivity graph ##### const Connectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} -function increment_index(i, R, bc) +struct RankConnectivity{E, W, N, S, T, B} + east :: E + west :: W + north :: N + south :: S + top :: T + bottom :: B +end + +RankConnectivity(; east, west, north, south, top, bottom) = + RankConnectivity(east, west, north, south, top, bottom) + +function increment_index(i, R, topo) + R == 1 && return nothing if i+1 > R - if bc isa PeriodicBC + if topo == Periodic return 1 else return nothing @@ -46,9 +55,10 @@ function increment_index(i, R, bc) end end -function decrement_index(i, R, bc) +function decrement_index(i, R, topo) + R == 1 && return nothing if i-1 < 1 - if bc isa PeriodicBC + if topo == Periodic return R else return nothing @@ -58,16 +68,17 @@ function decrement_index(i, R, bc) end end -function construct_connectivity(index, ranks, bcs) - i, j, k = index +function RankConnectivity(model_index, ranks, topology) + i, j, k = model_index Rx, Ry, Rz = ranks + TX, TY, TZ = topology - i_east = increment_index(i, Rx, bcs.x.right) - i_west = decrement_index(i, Rx, bcs.x.left) - j_north = increment_index(j, Ry, bcs.y.north) - j_south = decrement_index(j, Ry, bcs.y.south) - k_top = increment_index(k, Rz, bcs.z.top) - k_bot = decrement_index(k, Rz, bcs.z.bottom) + i_east = increment_index(i, Rx, TX) + i_west = decrement_index(i, Rx, TX) + j_north = increment_index(j, Ry, TY) + j_south = decrement_index(j, Ry, TY) + k_top = increment_index(k, Rz, TZ) + k_bot = decrement_index(k, Rz, TZ) r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) @@ -76,8 +87,8 @@ function construct_connectivity(index, ranks, bcs) r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) - return (east=r_east, west=r_west, north=r_north, - south=r_south, top=r_top, bottom=r_bot) + return RankConnectivity(east=r_east, west=r_west, north=r_north, + south=r_south, top=r_top, bottom=r_bot) end ##### @@ -85,136 +96,139 @@ end ##### struct HaloCommunication <: BCType end + const HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} -const HaloCommunicationDetails = NamedTuple{(:rank_from, :rank_to)} -HaloCommunicationDetails(; rank_from, rank_to) = HaloCommunicationDetails((rank_from, rank_to)) +HaloCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(HaloCommunication, val; kwargs...) 
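
# A sketch of how a communicating BC is assembled (HaloCommunicationRanks is
# defined just below; `neighbor_rank` here is a stand-in for a value from the
# rank connectivity graph):
#
#   comm_ranks = HaloCommunicationRanks(from=my_rank, to=neighbor_rank)
#   west_bc = HaloCommunicationBoundaryCondition(comm_ranks)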
-function inject_halo_communication_boundary_conditions(boundary_conditions, my_rank, connectivity) - new_field_bcs = [] +struct HaloCommunicationRanks{T} + from :: T + to :: T +end - for field_bcs in boundary_conditions - rank_east = connectivity.east - rank_west = connectivity.west - rank_north = connectivity.north - rank_south = connectivity.south - rank_top = connectivity.top - rank_bottom = connectivity.bottom - - east_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.east) - west_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.west) - north_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.north) - south_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.south) - top_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.top) - bottom_comm_bc_details = HaloCommunicationDetails(rank_from=my_rank, rank_to=connectivity.bottom) - - east_comm_bc = BoundaryCondition(HaloCommunication, east_comm_bc_details) - west_comm_bc = BoundaryCondition(HaloCommunication, west_comm_bc_details) - north_comm_bc = BoundaryCondition(HaloCommunication, north_comm_bc_details) - south_comm_bc = BoundaryCondition(HaloCommunication, south_comm_bc_details) - top_comm_bc = BoundaryCondition(HaloCommunication, top_comm_bc_details) - bottom_comm_bc = BoundaryCondition(HaloCommunication, bottom_comm_bc_details) - - x_bcs = CoordinateBoundaryConditions(isnothing(rank_west) ? field_bcs.x.left : west_comm_bc, - isnothing(rank_east) ? field_bcs.x.right : east_comm_bc) - - y_bcs = CoordinateBoundaryConditions(isnothing(rank_south) ? field_bcs.y.south : south_comm_bc, - isnothing(rank_north) ? field_bcs.y.north : north_comm_bc) - - z_bcs = CoordinateBoundaryConditions(isnothing(rank_bottom) ? field_bcs.z.bottom : bottom_comm_bc, - isnothing(rank_top) ? field_bcs.z.top : top_comm_bc) - - push!(new_field_bcs, FieldBoundaryConditions(x_bcs, y_bcs, z_bcs)) - end +HaloCommunicationRanks(; from, to) = HaloCommunicationRanks(from, to) - return NamedTuple{propertynames(boundary_conditions)}(Tuple(new_field_bcs)) +function inject_halo_communication_boundary_conditions(field_bcs, my_rank, connectivity) + new_field_bcs = [] + + rank_east = connectivity.east + rank_west = connectivity.west + rank_north = connectivity.north + rank_south = connectivity.south + rank_top = connectivity.top + rank_bottom = connectivity.bottom + + east_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_east) + west_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_west) + north_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_north) + south_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_south) + top_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_top) + bottom_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_bottom) + + east_comm_bc = HaloCommunicationBoundaryCondition(east_comm_ranks) + west_comm_bc = HaloCommunicationBoundaryCondition(west_comm_ranks) + north_comm_bc = HaloCommunicationBoundaryCondition(north_comm_ranks) + south_comm_bc = HaloCommunicationBoundaryCondition(south_comm_ranks) + top_comm_bc = HaloCommunicationBoundaryCondition(top_comm_ranks) + bottom_comm_bc = HaloCommunicationBoundaryCondition(bottom_comm_ranks) + + x_bcs = CoordinateBoundaryConditions(isnothing(rank_west) ? field_bcs.x.left : west_comm_bc, + isnothing(rank_east) ? 
field_bcs.x.right : east_comm_bc) + + y_bcs = CoordinateBoundaryConditions(isnothing(rank_south) ? field_bcs.y.south : south_comm_bc, + isnothing(rank_north) ? field_bcs.y.north : north_comm_bc) + + z_bcs = CoordinateBoundaryConditions(isnothing(rank_bottom) ? field_bcs.z.bottom : bottom_comm_bc, + isnothing(rank_top) ? field_bcs.z.top : top_comm_bc) + + return FieldBoundaryConditions(x_bcs, y_bcs, z_bcs) end ##### ##### Filling halos for halo communication boundary conditions ##### -sides = (:west, :east, :south, :north, :top, :bottom) -coords = (:x, :x, :y, :y, :z, :z) - -# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). -const MAX_RANKS = 10^3 - -# Define functions that return unique send and recv MPI tags for each side. -for (i, side) in enumerate(sides) - send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) - recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) - @eval begin - @inline $send_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_from + bc.condition.rank_to) + $i - @inline $recv_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_to + bc.condition.rank_from) + $i - end -end - -@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] -@inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :] -@inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :] -@inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :] -@inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H] -@inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H] - -@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ty, grid.Tz) -@inline south_recv_buffer(grid) = zeros(grid.Tx, grid.Hy, grid.Tz) -@inline top_recv_buffer(grid) = zeros(grid.Tx, grid.Ty, grid.Hz) - -const east_recv_buffer = west_recv_buffer -const north_recv_buffer = south_recv_buffer -const bottom_recv_buffer = top_recv_buffer - -@inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf) -@inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf) -@inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf) -@inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf) -@inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf) -@inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf) - -for (x, side) in zip(coords, sides) - H = Symbol(:H, x) - N = Symbol(:N, x) - - fill_fn_name = Symbol(:fill_, side, :_halo!) - send_buf_fn_name = Symbol(side, :_send_buffer) - recv_buf_fn_name = Symbol(side, :_recv_buffer) - send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) - recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) - copy_buf_fn_name = Symbol(:copy_recv_buffer_into_, side, :_halo!) - - @eval begin - function $fill_fn_name(c, bc::HaloCommunicationBC, arch, grid, args...) - send_buffer = $send_buf_fn_name(c, grid.$(N), grid.$(H)) - recv_buffer = $recv_buf_fn_name(grid) - - send_tag = $send_tag_fn_name(bc) - recv_tag = $recv_tag_fn_name(bc) - - my_rank = bc.condition.rank_from - rank_send_to = rank_recv_from = bc.condition.rank_to - - @info "MPI.Isend: my_rank=$my_rank, rank_send_to=$rank_send_to, send_tag=$send_tag" - MPI.Isend(send_buffer, rank_send_to, send_tag, MPI.COMM_WORLD) - @info "MPI.Isend: done!" - - @info "MPI.Recv!: my_rank=$my_rank, rank_recv_from=$rank_recv_from, recv_tag=$recv_tag" - MPI.Recv!(recv_buffer, rank_recv_from, recv_tag, MPI.COMM_WORLD) - @info "MPI.Recv! done!" 
- - # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * - # "send_tag=$send_tag, recv_tag=$recv_tag" - # - # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, - # recv_buffer, rank_recv_from, recv_tag, - # MPI.COMM_WORLD) - # - # @info "Sendrecv!: my_rank=$my_rank done!" - - $copy_buf_fn_name(c, grid.$(N), grid.$(H), recv_buffer) - end - end -end +# sides = (:west, :east, :south, :north, :top, :bottom) +# coords = (:x, :x, :y, :y, :z, :z) + +# # Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). +# const MAX_RANKS = 10^3 + +# # Define functions that return unique send and recv MPI tags for each side. +# for (i, side) in enumerate(sides) +# send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) +# recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) +# @eval begin +# @inline $send_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_from + bc.condition.rank_to) + $i +# @inline $recv_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_to + bc.condition.rank_from) + $i +# end +# end + +# @inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] +# @inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :] +# @inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :] +# @inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :] +# @inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H] +# @inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H] + +# @inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ty, grid.Tz) +# @inline south_recv_buffer(grid) = zeros(grid.Tx, grid.Hy, grid.Tz) +# @inline top_recv_buffer(grid) = zeros(grid.Tx, grid.Ty, grid.Hz) + +# const east_recv_buffer = west_recv_buffer +# const north_recv_buffer = south_recv_buffer +# const bottom_recv_buffer = top_recv_buffer + +# @inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf) +# @inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf) +# @inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf) +# @inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf) +# @inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf) +# @inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf) + +# for (x, side) in zip(coords, sides) +# H = Symbol(:H, x) +# N = Symbol(:N, x) + +# fill_fn_name = Symbol(:fill_, side, :_halo!) +# send_buf_fn_name = Symbol(side, :_send_buffer) +# recv_buf_fn_name = Symbol(side, :_recv_buffer) +# send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) +# recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) +# copy_buf_fn_name = Symbol(:copy_recv_buffer_into_, side, :_halo!) + +# @eval begin +# function $fill_fn_name(c, bc::HaloCommunicationBC, arch, grid, args...) +# send_buffer = $send_buf_fn_name(c, grid.$(N), grid.$(H)) +# recv_buffer = $recv_buf_fn_name(grid) + +# send_tag = $send_tag_fn_name(bc) +# recv_tag = $recv_tag_fn_name(bc) + +# my_rank = bc.condition.rank_from +# rank_send_to = rank_recv_from = bc.condition.rank_to + +# @info "MPI.Isend: my_rank=$my_rank, rank_send_to=$rank_send_to, send_tag=$send_tag" +# MPI.Isend(send_buffer, rank_send_to, send_tag, MPI.COMM_WORLD) +# @info "MPI.Isend: done!" + +# @info "MPI.Recv!: my_rank=$my_rank, rank_recv_from=$rank_recv_from, recv_tag=$recv_tag" +# MPI.Recv!(recv_buffer, rank_recv_from, recv_tag, MPI.COMM_WORLD) +# @info "MPI.Recv! done!" 
+ +# # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * +# # "send_tag=$send_tag, recv_tag=$recv_tag" +# # +# # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, +# # recv_buffer, rank_recv_from, recv_tag, +# # MPI.COMM_WORLD) +# # +# # @info "Sendrecv!: my_rank=$my_rank done!" + +# $copy_buf_fn_name(c, grid.$(N), grid.$(H), recv_buffer) +# end +# end +# end ##### ##### Distributed model struct and constructor @@ -235,17 +249,16 @@ x, y, z: Left and right endpoints for each dimension. ranks: Number of ranks in each dimension. model_kwargs: Passed to `Model` constructor. """ -function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwargs...) - validate_tupled_argument(ranks, Int, "size") +function DistributedModel(; grid, ranks, model_kwargs...) validate_tupled_argument(ranks, Int, "ranks") - Nx, Ny, Nz = size + Nx, Ny, Nz = size(grid) # Pull out left and right endpoints for full model. - xL, xR = x - yL, yR = y - zL, zR = z - Lx, Ly, Lz = xR-xL, yR-yL, zR-zL + xL, xR = grid.xF[1], grid.xF[Nx+1] + yL, yR = grid.yF[1], grid.yF[Ny+1] + zL, zR = grid.zF[1], grid.zF[Nz+1] + Lx, Ly, Lz = length(grid) Rx, Ry, Rz = ranks total_ranks = Rx*Ry*Rz @@ -257,18 +270,23 @@ function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwa if total_ranks != mpi_ranks throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " * - "with number of MPI ranks: $mpi_ranks. Exiting with code 1.")) + "with number of MPI ranks: $mpi_ranks. Exiting with return code 1.")) MPI.Finalize() exit(code=1) end i, j, k = index = rank2index(my_rank, Rx, Ry, Rz) - @debug "Rank: $my_rank, index: $index" + @debug "My rank: $my_rank, my index: $index" ##### ##### Construct local grid ##### + # Make sure we can put an integer number of grid points in each rank. + @assert isinteger(Nx / Rx) + @assert isinteger(Ny / Ry) + @assert isinteger(Nz / Rz) + nx, ny, nz = Nx÷Rx, Ny÷Ry, Nz÷Rz lx, ly, lz = Lx/Rx, Ly/Ry, Lz/Rz @@ -277,28 +295,27 @@ function DistributedModel(; size, x, y, z, ranks, boundary_conditions, model_kwa z₁, z₂ = zL + (k-1)*lz, zL + k*lz @debug "Constructing local grid: n=($nx, $ny, $nz), x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" - grid = RegularCartesianGrid(size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) + my_grid = RegularCartesianGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) ##### ##### Construct local connectivity ##### - my_connectivity = construct_connectivity(index, ranks, boundary_conditions.u) + my_connectivity = RankConnectivity(index, ranks, topology(grid)) @debug "Local connectivity: $my_connectivity" ##### ##### Change appropriate boundary conditions to halo communication BCs ##### - @debug "Injecting halo communication boundary conditions..." - boundary_conditions_with_communication = inject_halo_communication_boundary_conditions(boundary_conditions, my_rank, my_connectivity) + # @debug "Injecting halo communication boundary conditions..." + # boundary_conditions_with_communication = inject_halo_communication_boundary_conditions(boundary_conditions, my_rank, my_connectivity) ##### ##### Construct local model ##### - my_model = Model(; grid = grid, boundary_conditions = boundary_conditions_with_communication, - model_kwargs...) + my_model = IncompressibleModel(; grid=my_grid, model_kwargs...) 
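+    # (each rank builds and keeps its own local model on its own sub-grid;
+    # there is no master/root model)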
return DistributedModel(index, ranks, my_model, my_connectivity) end From c84a958e708051acd32536a85712e90ab2deae2d Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 09:54:29 -0500 Subject: [PATCH 044/100] Some basic tests for slab decomposition models --- src/Distributed/test_distributed_model.jl | 145 +++++++++++++++++++--- 1 file changed, 131 insertions(+), 14 deletions(-) diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index 9790dbd705..0e08689a9e 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -1,23 +1,140 @@ using Test +using MPI +using Oceananigans -import MPI +MPI.Initialized() || MPI.Init() +comm = MPI.COMM_WORLD -using Oceananigans: interior, fill_halo_regions! +# Right now just testing with 4 ranks! +mpi_ranks = MPI.Comm_size(comm) +@assert mpi_ranks == 4 -include("distributed_model.jl") +function test_triply_periodic_connectivity_with_411_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + dm = DistributedModel(grid=full_grid, ranks=(4, 1, 1)) -MPI.Init() + my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test my_rank == index2rank(dm.index..., dm.ranks...) -dm = DistributedModel(ranks = (2, 2, 2), size = (16, 16, 16), - x = (0, 1), y = (-0.5, 0.5), z = (-10, 0), - boundary_conditions = HorizontallyPeriodicSolutionBCs(), - poisson_solver = nothing) + model = dm.model + connectivity = dm.connectivity -my_rank = MPI.Comm_rank(MPI.COMM_WORLD) -# @info "Rank $my_rank: $(dm.connectivity), $(dm.model.grid.zF[end])" -# @info "u.x BCs: $(dm.model.boundary_conditions.solution.u.x)" + # No communication in y and z. + @test isnothing(connectivity.south) + @test isnothing(connectivity.north) + @test isnothing(connectivity.top) + @test isnothing(connectivity.bottom) -interior(dm.model.velocities.u) .= rand(8, 8, 8) -fill_halo_regions!(dm.model.velocities.u.data, dm.model.boundary_conditions.solution.u, CPU(), dm.model.grid) + if my_rank == 0 + @test connectivity.east == 1 + @test connectivity.west == 3 + elseif my_rank == 1 + @test connectivity.east == 2 + @test connectivity.west == 0 + elseif my_rank == 2 + @test connectivity.east == 3 + @test connectivity.west == 1 + elseif my_rank == 3 + @test connectivity.east == 0 + @test connectivity.west == 2 + end + + nx, ny, nz = size(model.grid) + @test model.grid.xF[1] == 0.25*my_rank + @test model.grid.xF[nx+1] == 0.25*(my_rank+1) + @test model.grid.yF[1] == 0 + @test model.grid.yF[ny+1] == 2 + @test model.grid.zF[1] == -3 + @test model.grid.zF[nz+1] == 0 +end + +function test_triply_periodic_connectivity_with_141_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + dm = DistributedModel(grid=full_grid, ranks=(1, 4, 1)) + + my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test my_rank == index2rank(dm.index..., dm.ranks...) + + model = dm.model + connectivity = dm.connectivity + + # No communication in x and z. 
+ @test isnothing(connectivity.east) + @test isnothing(connectivity.west) + @test isnothing(connectivity.top) + @test isnothing(connectivity.bottom) + + if my_rank == 0 + @test connectivity.north == 1 + @test connectivity.south == 3 + elseif my_rank == 1 + @test connectivity.north == 2 + @test connectivity.south == 0 + elseif my_rank == 2 + @test connectivity.north == 3 + @test connectivity.south == 1 + elseif my_rank == 3 + @test connectivity.north == 0 + @test connectivity.south == 2 + end + + nx, ny, nz = size(model.grid) + @test model.grid.xF[1] == 0 + @test model.grid.xF[nx+1] == 1 + @test model.grid.yF[1] == 0.5*my_rank + @test model.grid.yF[ny+1] == 0.5*(my_rank+1) + @test model.grid.zF[1] == -3 + @test model.grid.zF[nz+1] == 0 +end + +function test_triply_periodic_connectivity_with_114_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + dm = DistributedModel(grid=full_grid, ranks=(1, 1, 4)) + + my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test my_rank == index2rank(dm.index..., dm.ranks...) + + model = dm.model + connectivity = dm.connectivity + + # No communication in x and y. + @test isnothing(connectivity.east) + @test isnothing(connectivity.west) + @test isnothing(connectivity.north) + @test isnothing(connectivity.south) + + if my_rank == 0 + @test connectivity.top == 1 + @test connectivity.bottom == 3 + elseif my_rank == 1 + @test connectivity.top == 2 + @test connectivity.bottom == 0 + elseif my_rank == 2 + @test connectivity.top == 3 + @test connectivity.bottom == 1 + elseif my_rank == 3 + @test connectivity.top == 0 + @test connectivity.bottom == 2 + end + + nx, ny, nz = size(model.grid) + @test model.grid.xF[1] == 0 + @test model.grid.xF[nx+1] == 1 + @test model.grid.yF[1] == 0 + @test model.grid.yF[ny+1] == 2 + @test model.grid.zF[1] == -3 + 0.75*my_rank + @test model.grid.zF[nz+1] == -3 + 0.75*(my_rank+1) +end + +@testset "Distributed MPI Oceananigans" begin + test_triply_periodic_connectivity_with_411_ranks() + test_triply_periodic_connectivity_with_141_ranks() + test_triply_periodic_connectivity_with_114_ranks() +end + +# MPI.Finalize() +# @test MPI.Finalized() -display(interior(dm.model.velocities.u)) From 571eba539513076aa549a6e06133ae28b1430931 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 15:09:35 -0500 Subject: [PATCH 045/100] Properly inject halo communication BCs --- src/Distributed/distributed_model.jl | 72 +++++++++++++++-------- src/Distributed/test_distributed_model.jl | 30 +++++++++- 2 files changed, 77 insertions(+), 25 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 777571bf9b..5c426e98f9 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -7,6 +7,7 @@ using Oceananigans.Grids: validate_tupled_argument using Oceananigans.BoundaryConditions: BCType import Oceananigans.BoundaryConditions: + bctype_str, print_condition, fill_west_halo!, fill_east_halo!, fill_south_halo!, fill_north_halo!, fill_bottom_halo!, fill_top_halo! 
@@ -28,8 +29,6 @@ end ##### Rank connectivity graph ##### -const Connectivity = NamedTuple{(:east, :west, :north, :south, :top, :bottom)} - struct RankConnectivity{E, W, N, S, T, B} east :: E west :: W @@ -97,20 +96,22 @@ end struct HaloCommunication <: BCType end -const HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} +# const HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} + +bctype_str(::HaloCommunicationBC) ="HaloCommunication" HaloCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(HaloCommunication, val; kwargs...) -struct HaloCommunicationRanks{T} - from :: T +struct HaloCommunicationRanks{F, T} + from :: F to :: T end HaloCommunicationRanks(; from, to) = HaloCommunicationRanks(from, to) -function inject_halo_communication_boundary_conditions(field_bcs, my_rank, connectivity) - new_field_bcs = [] +print_condition(hcr::HaloCommunicationRanks) = "(from rank $(hcr.from), to rank $(hcr.to))" +function inject_halo_communication_boundary_conditions(field_bcs, my_rank, connectivity) rank_east = connectivity.east rank_west = connectivity.west rank_north = connectivity.north @@ -132,14 +133,14 @@ function inject_halo_communication_boundary_conditions(field_bcs, my_rank, conne top_comm_bc = HaloCommunicationBoundaryCondition(top_comm_ranks) bottom_comm_bc = HaloCommunicationBoundaryCondition(bottom_comm_ranks) - x_bcs = CoordinateBoundaryConditions(isnothing(rank_west) ? field_bcs.x.left : west_comm_bc, - isnothing(rank_east) ? field_bcs.x.right : east_comm_bc) + x_bcs = CoordinateBoundaryConditions(isnothing(rank_west) ? field_bcs.west : west_comm_bc, + isnothing(rank_east) ? field_bcs.east : east_comm_bc) - y_bcs = CoordinateBoundaryConditions(isnothing(rank_south) ? field_bcs.y.south : south_comm_bc, - isnothing(rank_north) ? field_bcs.y.north : north_comm_bc) + y_bcs = CoordinateBoundaryConditions(isnothing(rank_south) ? field_bcs.south : south_comm_bc, + isnothing(rank_north) ? field_bcs.north : north_comm_bc) - z_bcs = CoordinateBoundaryConditions(isnothing(rank_bottom) ? field_bcs.z.bottom : bottom_comm_bc, - isnothing(rank_top) ? field_bcs.z.top : top_comm_bc) + z_bcs = CoordinateBoundaryConditions(isnothing(rank_bottom) ? field_bcs.bottom : bottom_comm_bc, + isnothing(rank_top) ? field_bcs.top : top_comm_bc) return FieldBoundaryConditions(x_bcs, y_bcs, z_bcs) end @@ -235,10 +236,10 @@ end ##### struct DistributedModel{I, A, R, G} - index :: I - ranks :: R - model :: A - connectivity :: G + index :: I + ranks :: R + model :: A + connectivity :: G end """ @@ -249,7 +250,7 @@ x, y, z: Left and right endpoints for each dimension. ranks: Number of ranks in each dimension. model_kwargs: Passed to `Model` constructor. """ -function DistributedModel(; grid, ranks, model_kwargs...) +function DistributedModel(; grid, ranks, boundary_conditions=nothing, model_kwargs...) validate_tupled_argument(ranks, Int, "ranks") Nx, Ny, Nz = size(grid) @@ -276,7 +277,7 @@ function DistributedModel(; grid, ranks, model_kwargs...) end i, j, k = index = rank2index(my_rank, Rx, Ry, Rz) - @debug "My rank: $my_rank, my index: $index" + @info "My rank: $my_rank, my index: $index" ##### ##### Construct local grid @@ -294,7 +295,7 @@ function DistributedModel(; grid, ranks, model_kwargs...) 
y₁, y₂ = yL + (j-1)*ly, yL + j*ly z₁, z₂ = zL + (k-1)*lz, zL + k*lz - @debug "Constructing local grid: n=($nx, $ny, $nz), x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" + @info "Constructing local grid: n=($nx, $ny, $nz), x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" my_grid = RegularCartesianGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) ##### @@ -302,20 +303,43 @@ function DistributedModel(; grid, ranks, model_kwargs...) ##### my_connectivity = RankConnectivity(index, ranks, topology(grid)) - @debug "Local connectivity: $my_connectivity" + @info "Local connectivity: $my_connectivity" ##### ##### Change appropriate boundary conditions to halo communication BCs ##### - # @debug "Injecting halo communication boundary conditions..." - # boundary_conditions_with_communication = inject_halo_communication_boundary_conditions(boundary_conditions, my_rank, my_connectivity) + # FIXME: Stop assuming (u, v, w, T, S). + + bcs = isnothing(boundary_conditions) ? NamedTuple() : boundary_conditions + + bcs = ( + u = haskey(bcs, :u) ? bcs.u : UVelocityBoundaryConditions(grid), + v = haskey(bcs, :v) ? bcs.v : VVelocityBoundaryConditions(grid), + w = haskey(bcs, :w) ? bcs.w : WVelocityBoundaryConditions(grid), + T = haskey(bcs, :T) ? bcs.T : TracerBoundaryConditions(grid), + S = haskey(bcs, :S) ? bcs.S : TracerBoundaryConditions(grid) + ) + + @debug "Injecting halo communication boundary conditions..." + + communicative_bcs = ( + u = inject_halo_communication_boundary_conditions(bcs.u, my_rank, my_connectivity), + v = inject_halo_communication_boundary_conditions(bcs.v, my_rank, my_connectivity), + w = inject_halo_communication_boundary_conditions(bcs.w, my_rank, my_connectivity), + T = inject_halo_communication_boundary_conditions(bcs.T, my_rank, my_connectivity), + S = inject_halo_communication_boundary_conditions(bcs.S, my_rank, my_connectivity) + ) ##### ##### Construct local model ##### - my_model = IncompressibleModel(; grid=my_grid, model_kwargs...) + my_model = IncompressibleModel(; grid=my_grid, boundary_conditions=communicative_bcs, model_kwargs...) 
return DistributedModel(index, ranks, my_model, my_connectivity) end + +function Base.show(io::IO, dm::DistributedModel) + print(io, "DistributedModel with $(dm.ranks) ranks") +end diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index 0e08689a9e..b9561e685e 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -47,6 +47,15 @@ function test_triply_periodic_connectivity_with_411_ranks() @test model.grid.yF[ny+1] == 2 @test model.grid.zF[1] == -3 @test model.grid.zF[nz+1] == 0 + + for field in fields(model) + @test field.boundary_conditions.east isa HaloCommunicationBC + @test field.boundary_conditions.west isa HaloCommunicationBC + @test !isa(field.boundary_conditions.north, HaloCommunicationBC) + @test !isa(field.boundary_conditions.south, HaloCommunicationBC) + @test !isa(field.boundary_conditions.top, HaloCommunicationBC) + @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) + end end function test_triply_periodic_connectivity_with_141_ranks() @@ -87,6 +96,15 @@ function test_triply_periodic_connectivity_with_141_ranks() @test model.grid.yF[ny+1] == 0.5*(my_rank+1) @test model.grid.zF[1] == -3 @test model.grid.zF[nz+1] == 0 + + for field in fields(model) + @test !isa(field.boundary_conditions.east, HaloCommunicationBC) + @test !isa(field.boundary_conditions.west, HaloCommunicationBC) + @test field.boundary_conditions.north isa HaloCommunicationBC + @test field.boundary_conditions.south isa HaloCommunicationBC + @test !isa(field.boundary_conditions.top, HaloCommunicationBC) + @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) + end end function test_triply_periodic_connectivity_with_114_ranks() @@ -127,14 +145,24 @@ function test_triply_periodic_connectivity_with_114_ranks() @test model.grid.yF[ny+1] == 2 @test model.grid.zF[1] == -3 + 0.75*my_rank @test model.grid.zF[nz+1] == -3 + 0.75*(my_rank+1) + + for field in fields(model) + @test !isa(field.boundary_conditions.east, HaloCommunicationBC) + @test !isa(field.boundary_conditions.west, HaloCommunicationBC) + @test !isa(field.boundary_conditions.north, HaloCommunicationBC) + @test !isa(field.boundary_conditions.south, HaloCommunicationBC) + @test field.boundary_conditions.top isa HaloCommunicationBC + @test field.boundary_conditions.bottom isa HaloCommunicationBC + end end @testset "Distributed MPI Oceananigans" begin test_triply_periodic_connectivity_with_411_ranks() test_triply_periodic_connectivity_with_141_ranks() test_triply_periodic_connectivity_with_114_ranks() + # TODO: 221 ranks + # TODO: triply bounded end # MPI.Finalize() # @test MPI.Finalized() - From ea72c5d9508b5e5b1aef0086ac293b5586476af4 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 17:34:39 -0500 Subject: [PATCH 046/100] New multi-architectures --- src/Distributed/distributed_model.jl | 40 ++++++++++++++++------- src/Distributed/test_distributed_model.jl | 11 +++++-- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 5c426e98f9..179eb69949 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -1,6 +1,7 @@ import MPI using Oceananigans +using Oceananigans.Architectures using Oceananigans.Grids using Oceananigans.Grids: validate_tupled_argument @@ -11,6 +12,21 @@ import Oceananigans.BoundaryConditions: fill_west_halo!, fill_east_halo!, fill_south_halo!, fill_north_halo!, fill_bottom_halo!, 
fill_top_halo! +##### +##### Architecture stuff +##### + +# TODO: Put connectivity inside architecture? MPI should be initialize so you can construct it in there. +# Might have to make it MultiCPU(; grid, ranks) + +struct MultiCPU{R} <: AbstractArchitecture + ranks :: R +end + +MultiCPU(; ranks) = MultiCPU(ranks) + +child_architecture(::MultiCPU) = CPU() + ##### ##### Converting between index and MPI rank taking k as the fast index ##### @@ -235,22 +251,17 @@ end ##### Distributed model struct and constructor ##### -struct DistributedModel{I, A, R, G} +struct DistributedModel{A, I, M, R, G} + architecture :: A index :: I ranks :: R - model :: A + model :: M connectivity :: G end -""" - DistributedModel(size, x, y, z, ranks, model_kwargs...) +function DistributedModel(; architecture, grid, boundary_conditions=nothing, model_kwargs...) + ranks = architecture.ranks -size: Number of total grid points. -x, y, z: Left and right endpoints for each dimension. -ranks: Number of ranks in each dimension. -model_kwargs: Passed to `Model` constructor. -""" -function DistributedModel(; grid, ranks, boundary_conditions=nothing, model_kwargs...) validate_tupled_argument(ranks, Int, "ranks") Nx, Ny, Nz = size(grid) @@ -335,9 +346,14 @@ function DistributedModel(; grid, ranks, boundary_conditions=nothing, model_kwar ##### Construct local model ##### - my_model = IncompressibleModel(; grid=my_grid, boundary_conditions=communicative_bcs, model_kwargs...) + my_model = IncompressibleModel(; + architecture = child_architecture(architecture), + grid = my_grid, + boundary_conditions = communicative_bcs, + model_kwargs... + ) - return DistributedModel(index, ranks, my_model, my_connectivity) + return DistributedModel(architecture, index, ranks, my_model, my_connectivity) end function Base.show(io::IO, dm::DistributedModel) diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index b9561e685e..4e515b6631 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -2,6 +2,8 @@ using Test using MPI using Oceananigans +using Oceananigans.BoundaryConditions: fill_halo_regions! + MPI.Initialized() || MPI.Init() comm = MPI.COMM_WORLD @@ -12,7 +14,8 @@ mpi_ranks = MPI.Comm_size(comm) function test_triply_periodic_connectivity_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - dm = DistributedModel(grid=full_grid, ranks=(4, 1, 1)) + arch = MultiCPU(ranks=(4, 1, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @test my_rank == index2rank(dm.index..., dm.ranks...) @@ -61,7 +64,8 @@ end function test_triply_periodic_connectivity_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - dm = DistributedModel(grid=full_grid, ranks=(1, 4, 1)) + arch = MultiCPU(ranks=(1, 4, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @test my_rank == index2rank(dm.index..., dm.ranks...) 
@@ -110,7 +114,8 @@ end function test_triply_periodic_connectivity_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - dm = DistributedModel(grid=full_grid, ranks=(1, 1, 4)) + arch = MultiCPU(ranks=(1, 1, 4)) + dm = DistributedModel(architecture=arch, grid=full_grid) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @test my_rank == index2rank(dm.index..., dm.ranks...) From c92dbb8f7a9b762de920ba33a88ac795d089faf3 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 18:35:32 -0500 Subject: [PATCH 047/100] Success communicating east/west halos! --- src/Distributed/distributed_model.jl | 231 +++++++++++++++++---------- 1 file changed, 148 insertions(+), 83 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 179eb69949..21f7b5159b 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -4,11 +4,13 @@ using Oceananigans using Oceananigans.Architectures using Oceananigans.Grids +using KernelAbstractions: @kernel, @index, Event, MultiEvent using Oceananigans.Grids: validate_tupled_argument using Oceananigans.BoundaryConditions: BCType import Oceananigans.BoundaryConditions: bctype_str, print_condition, + fill_halo_regions!, fill_west_halo!, fill_east_halo!, fill_south_halo!, fill_north_halo!, fill_bottom_halo!, fill_top_halo! @@ -19,7 +21,9 @@ import Oceananigans.BoundaryConditions: # TODO: Put connectivity inside architecture? MPI should be initialize so you can construct it in there. # Might have to make it MultiCPU(; grid, ranks) -struct MultiCPU{R} <: AbstractArchitecture +abstract type AbstractMultiArchitecture <: AbstractArchitecture end + +struct MultiCPU{R} <: AbstractMultiArchitecture ranks :: R end @@ -112,7 +116,7 @@ end struct HaloCommunication <: BCType end -# const HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} +HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} bctype_str(::HaloCommunicationBC) ="HaloCommunication" @@ -161,91 +165,152 @@ function inject_halo_communication_boundary_conditions(field_bcs, my_rank, conne return FieldBoundaryConditions(x_bcs, y_bcs, z_bcs) end +##### +##### MPI tags for halo communication BCs +##### + +sides = (:west, :east, :south, :north, :top, :bottom) + +side_id = Dict( + :east => 1, :west => 2, + :north => 3, :south => 4, + :top => 5, :bottom => 6 +) + +opposite_side = Dict( + :east => :west, :west => :east, + :north => :south, :south => :north, + :top => :bottom, :bottom => :top +) + +# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). +const MAX_RANKS = 10^3 +RANK_DIGITS = 3 + +# Define functions that return unique send and recv MPI tags for each side. 
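+# For example, with RANK_DIGITS=3 a message sent from rank 1 to rank 2 through
+# the east side (side digit 1) gets send tag parse(Int, "001" * "002" * "1") = 10021.
+# The eastern neighbour computes its west recv tag by swapping the rank digits
+# and using the opposite side, which yields the same 10021, so the two match up.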
+for side in sides + side_str = string(side) + send_tag_fn_name = Symbol(side, :_send_tag) + recv_tag_fn_name = Symbol(side, :_recv_tag) + @eval begin + function $send_tag_fn_name(bc) + from_digits = string(bc.condition.from, pad=RANK_DIGITS) + to_digits = string(bc.condition.to, pad=RANK_DIGITS) + side_digit = string(side_id[Symbol($side_str)]) + return parse(Int, from_digits * to_digits * side_digit) + end + + function $recv_tag_fn_name(bc) + from_digits = string(bc.condition.from, pad=RANK_DIGITS) + to_digits = string(bc.condition.to, pad=RANK_DIGITS) + side_digit = string(side_id[opposite_side[Symbol($side_str)]]) + return parse(Int, to_digits * from_digits * side_digit) + end + end +end + ##### ##### Filling halos for halo communication boundary conditions ##### -# sides = (:west, :east, :south, :north, :top, :bottom) -# coords = (:x, :x, :y, :y, :z, :z) - -# # Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). -# const MAX_RANKS = 10^3 - -# # Define functions that return unique send and recv MPI tags for each side. -# for (i, side) in enumerate(sides) -# send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) -# recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) -# @eval begin -# @inline $send_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_from + bc.condition.rank_to) + $i -# @inline $recv_tag_fn_name(bc) = 6 * (MAX_RANKS * bc.condition.rank_to + bc.condition.rank_from) + $i -# end -# end - -# @inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] -# @inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :] -# @inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :] -# @inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :] -# @inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H] -# @inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H] - -# @inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ty, grid.Tz) -# @inline south_recv_buffer(grid) = zeros(grid.Tx, grid.Hy, grid.Tz) -# @inline top_recv_buffer(grid) = zeros(grid.Tx, grid.Ty, grid.Hz) - -# const east_recv_buffer = west_recv_buffer -# const north_recv_buffer = south_recv_buffer -# const bottom_recv_buffer = top_recv_buffer - -# @inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf) -# @inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf) -# @inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf) -# @inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf) -# @inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf) -# @inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf) - -# for (x, side) in zip(coords, sides) -# H = Symbol(:H, x) -# N = Symbol(:N, x) - -# fill_fn_name = Symbol(:fill_, side, :_halo!) -# send_buf_fn_name = Symbol(side, :_send_buffer) -# recv_buf_fn_name = Symbol(side, :_recv_buffer) -# send_tag_fn_name = Symbol(side, :_halo_comm_bc_send_tag) -# recv_tag_fn_name = Symbol(side, :_halo_comm_bc_recv_tag) -# copy_buf_fn_name = Symbol(:copy_recv_buffer_into_, side, :_halo!) - -# @eval begin -# function $fill_fn_name(c, bc::HaloCommunicationBC, arch, grid, args...) 
-# send_buffer = $send_buf_fn_name(c, grid.$(N), grid.$(H)) -# recv_buffer = $recv_buf_fn_name(grid) - -# send_tag = $send_tag_fn_name(bc) -# recv_tag = $recv_tag_fn_name(bc) - -# my_rank = bc.condition.rank_from -# rank_send_to = rank_recv_from = bc.condition.rank_to - -# @info "MPI.Isend: my_rank=$my_rank, rank_send_to=$rank_send_to, send_tag=$send_tag" -# MPI.Isend(send_buffer, rank_send_to, send_tag, MPI.COMM_WORLD) -# @info "MPI.Isend: done!" - -# @info "MPI.Recv!: my_rank=$my_rank, rank_recv_from=$rank_recv_from, recv_tag=$recv_tag" -# MPI.Recv!(recv_buffer, rank_recv_from, recv_tag, MPI.COMM_WORLD) -# @info "MPI.Recv! done!" - -# # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * -# # "send_tag=$send_tag, recv_tag=$recv_tag" -# # -# # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, -# # recv_buffer, rank_recv_from, recv_tag, -# # MPI.COMM_WORLD) -# # -# # @info "Sendrecv!: my_rank=$my_rank done!" - -# $copy_buf_fn_name(c, grid.$(N), grid.$(H), recv_buffer) -# end -# end -# end +@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] +@inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :] +@inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :] +@inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :] +@inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H] +@inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H] + +@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ny + 2grid.Hy, grid.Nz + 2grid.Hz) +@inline south_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Hy, grid.Nz + 2grid.Hz) +@inline top_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Ny + 2grid.Hy, grid.Hz) + +const east_recv_buffer = west_recv_buffer +const north_recv_buffer = south_recv_buffer +const bottom_recv_buffer = top_recv_buffer + +@inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf) +@inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf) +@inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf) +@inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf) +@inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf) +@inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf) + +function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, args...) + + barrier = Event(device(child_architecture(arch))) + + east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...) + # north_event, south_event = fill_north_and_south_halos!(c, bcs.north, bcs.south, arch, barrier, grid, args...) + # top_event, bottom_event = fill_top_and_bottom_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...) + + events = [east_event, west_event] # , north_event, south_event, top_event, bottom_event] + events = filter(e -> e isa Event, events) + wait(device(child_architecture(arch)), MultiEvent(Tuple(events))) + + return nothing +end + +function fill_east_and_west_halos!(c, east_bc, west_bc, arch, barrier, grid, args...) + east_event = fill_east_halo!(c, east_bc, child_architecture(arch), barrier, grid, args...) + west_event = fill_west_halo!(c, west_bc, child_architecture(arch), barrier, grid, args...) + return east_event, west_event +end + +function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, args...) 
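+    # Note: the two Isend requests below are never explicitly waited on; the
+    # blocking MPI.Recv! calls further down are what complete the exchange.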
+ # 1 -> send east halo to eastern rank and fill east halo from eastern rank's west halo. + # 2 -> send west halo to western rank and fill west halo from western rank's east halo. + + @assert east_bc.condition.from == west_bc.condition.from + my_rank = east_bc.condition.from + + rank_to_send_to1 = east_bc.condition.to + rank_to_send_to2 = west_bc.condition.to + + send_buffer1 = east_send_buffer(c, grid.Nx, grid.Hx) + send_buffer2 = west_send_buffer(c, grid.Nx, grid.Hx) + + send_tag1 = east_send_tag(east_bc) + send_tag2 = west_send_tag(west_bc) + + @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to1, send_tag=$send_tag1" + @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to2, send_tag=$send_tag2" + + send_req1 = MPI.Isend(send_buffer1, rank_to_send_to1, send_tag1, MPI.COMM_WORLD) + send_req2 = MPI.Isend(send_buffer2, rank_to_send_to2, send_tag2, MPI.COMM_WORLD) + + ### + + rank_to_recv_from1 = east_bc.condition.to + rank_to_recv_from2 = west_bc.condition.to + + recv_buffer1 = east_recv_buffer(grid) + recv_buffer2 = west_recv_buffer(grid) + + recv_tag1 = east_recv_tag(east_bc) + recv_tag2 = west_recv_tag(west_bc) + + @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from1, recv_tag=$recv_tag1" + @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from2, recv_tag=$recv_tag2" + + MPI.Recv!(recv_buffer1, rank_to_recv_from1, recv_tag1, MPI.COMM_WORLD) + MPI.Recv!(recv_buffer2, rank_to_recv_from2, recv_tag2, MPI.COMM_WORLD) + + @info "Communication done!" + + copy_recv_buffer_into_east_halo!(c, grid.Nx, grid.Hx, recv_buffer1) + copy_recv_buffer_into_west_halo!(c, grid.Nx, grid.Hx, recv_buffer2) + + # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * + # "send_tag=$send_tag, recv_tag=$recv_tag" + # + # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, + # recv_buffer, rank_recv_from, recv_tag, + # MPI.COMM_WORLD) + # + # @info "Sendrecv!: my_rank=$my_rank done!" + + return nothing, nothing +end ##### ##### Distributed model struct and constructor From 50fc38793f85e1559a69bb499d714f9420a6907c Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 19:33:14 -0500 Subject: [PATCH 048/100] More responsibilities for `MultiCPU` --- src/Distributed/distributed_architectures.jl | 143 +++++++++++++++++ src/Distributed/distributed_model.jl | 158 ++----------------- 2 files changed, 155 insertions(+), 146 deletions(-) create mode 100644 src/Distributed/distributed_architectures.jl diff --git a/src/Distributed/distributed_architectures.jl b/src/Distributed/distributed_architectures.jl new file mode 100644 index 0000000000..400cab3908 --- /dev/null +++ b/src/Distributed/distributed_architectures.jl @@ -0,0 +1,143 @@ +using Oceananigans.Grids: validate_tupled_argument + +# TODO: Put connectivity inside architecture? MPI should be initialize so you can construct it in there. 
+# Might have to make it MultiCPU(; grid, ranks) + +abstract type AbstractMultiArchitecture <: AbstractArchitecture end + +struct MultiCPU{R, I, ρ, C} <: AbstractMultiArchitecture + my_rank :: R + my_index :: I + ranks :: ρ + connectivity :: C +end + +child_architecture(::MultiCPU) = CPU() + +##### +##### Converting between index and MPI rank taking k as the fast index +##### + +@inline index2rank(i, j, k, Rx, Ry, Rz) = (i-1)*Ry*Rz + (j-1)*Rz + (k-1) + +@inline function rank2index(r, Rx, Ry, Rz) + i = div(r, Ry*Rz) + r -= i*Ry*Rz + j = div(r, Rz) + k = mod(r, Rz) + return i+1, j+1, k+1 # 1-based Julia +end + +##### +##### Rank connectivity graph +##### + +struct RankConnectivity{E, W, N, S, T, B} + east :: E + west :: W + north :: N + south :: S + top :: T + bottom :: B +end + +RankConnectivity(; east, west, north, south, top, bottom) = + RankConnectivity(east, west, north, south, top, bottom) + +function increment_index(i, R, topo) + R == 1 && return nothing + if i+1 > R + if topo == Periodic + return 1 + else + return nothing + end + else + return i+1 + end +end + +function decrement_index(i, R, topo) + R == 1 && return nothing + if i-1 < 1 + if topo == Periodic + return R + else + return nothing + end + else + return i-1 + end +end + +function RankConnectivity(model_index, ranks, topology) + i, j, k = model_index + Rx, Ry, Rz = ranks + TX, TY, TZ = topology + + i_east = increment_index(i, Rx, TX) + i_west = decrement_index(i, Rx, TX) + j_north = increment_index(j, Ry, TY) + j_south = decrement_index(j, Ry, TY) + k_top = increment_index(k, Rz, TZ) + k_bot = decrement_index(k, Rz, TZ) + + r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) + r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) + r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) + r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz) + r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) + r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) + + return RankConnectivity(east=r_east, west=r_west, north=r_north, + south=r_south, top=r_top, bottom=r_bot) +end + +##### +##### Constructors +##### + +function MultiCPU(; grid, ranks) + MPI.Initialized() || error("Must call MPI.Init() before constructing a MultiCPU.") + + validate_tupled_argument(ranks, Int, "ranks") + + Rx, Ry, Rz = ranks + total_ranks = Rx*Ry*Rz + + comm = MPI.COMM_WORLD + + mpi_ranks = MPI.Comm_size(comm) + my_rank = MPI.Comm_rank(comm) + + i, j, k = my_index = rank2index(my_rank, Rx, Ry, Rz) + + if total_ranks != mpi_ranks + throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " * + "with number of MPI ranks: $mpi_ranks. Exiting with return code 1.")) + MPI.Finalize() + exit(code=1) + end + + comm = MPI.COMM_WORLD + + my_connectivity = RankConnectivity(my_index, ranks, topology(grid)) + + return MultiCPU(my_rank, my_index, ranks, my_connectivity) +end + +##### +##### Pretty printing +##### + +function Base.show(io::IO, arch::MultiCPU) + c = arch.connectivity + print(io, "MultiCPU architecture (rank $(arch.my_rank)/$(prod(arch.ranks))) [index $(arch.my_index) / $(arch.ranks)]\n", + "└── connectivity:", + isnothing(c.east) ? "" : " east=$(c.east)", + isnothing(c.west) ? "" : " west=$(c.west)", + isnothing(c.north) ? "" : " north=$(c.north)", + isnothing(c.south) ? "" : " south=$(c.south)", + isnothing(c.top) ? "" : " top=$(c.top)", + isnothing(c.bottom) ? 
"" : " bottom=$(c.bottom)") +end diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 21f7b5159b..24ff6b5f8a 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -5,7 +5,6 @@ using Oceananigans.Architectures using Oceananigans.Grids using KernelAbstractions: @kernel, @index, Event, MultiEvent -using Oceananigans.Grids: validate_tupled_argument using Oceananigans.BoundaryConditions: BCType import Oceananigans.BoundaryConditions: @@ -14,101 +13,8 @@ import Oceananigans.BoundaryConditions: fill_west_halo!, fill_east_halo!, fill_south_halo!, fill_north_halo!, fill_bottom_halo!, fill_top_halo! -##### -##### Architecture stuff -##### - -# TODO: Put connectivity inside architecture? MPI should be initialize so you can construct it in there. -# Might have to make it MultiCPU(; grid, ranks) - -abstract type AbstractMultiArchitecture <: AbstractArchitecture end - -struct MultiCPU{R} <: AbstractMultiArchitecture - ranks :: R -end - -MultiCPU(; ranks) = MultiCPU(ranks) - -child_architecture(::MultiCPU) = CPU() - -##### -##### Converting between index and MPI rank taking k as the fast index -##### +include("distributed_architectures.jl") -@inline index2rank(i, j, k, Rx, Ry, Rz) = (i-1)*Ry*Rz + (j-1)*Rz + (k-1) - -@inline function rank2index(r, Rx, Ry, Rz) - i = div(r, Ry*Rz) - r -= i*Ry*Rz - j = div(r, Rz) - k = mod(r, Rz) - return i+1, j+1, k+1 -end - -##### -##### Rank connectivity graph -##### - -struct RankConnectivity{E, W, N, S, T, B} - east :: E - west :: W - north :: N - south :: S - top :: T - bottom :: B -end - -RankConnectivity(; east, west, north, south, top, bottom) = - RankConnectivity(east, west, north, south, top, bottom) - -function increment_index(i, R, topo) - R == 1 && return nothing - if i+1 > R - if topo == Periodic - return 1 - else - return nothing - end - else - return i+1 - end -end - -function decrement_index(i, R, topo) - R == 1 && return nothing - if i-1 < 1 - if topo == Periodic - return R - else - return nothing - end - else - return i-1 - end -end - -function RankConnectivity(model_index, ranks, topology) - i, j, k = model_index - Rx, Ry, Rz = ranks - TX, TY, TZ = topology - - i_east = increment_index(i, Rx, TX) - i_west = decrement_index(i, Rx, TX) - j_north = increment_index(j, Ry, TY) - j_south = decrement_index(j, Ry, TY) - k_top = increment_index(k, Rz, TZ) - k_bot = decrement_index(k, Rz, TZ) - - r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) - r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) - r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) - r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz) - r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) - r_bot = isnothing(k_bot) ? 
nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) - - return RankConnectivity(east=r_east, west=r_west, north=r_north, - south=r_south, top=r_top, bottom=r_bot) -end ##### ##### Halo communication boundary condition @@ -278,8 +184,6 @@ function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::Hal send_req1 = MPI.Isend(send_buffer1, rank_to_send_to1, send_tag1, MPI.COMM_WORLD) send_req2 = MPI.Isend(send_buffer2, rank_to_send_to2, send_tag2, MPI.COMM_WORLD) - ### - rank_to_recv_from1 = east_bc.condition.to rank_to_recv_from2 = west_bc.condition.to @@ -316,18 +220,20 @@ end ##### Distributed model struct and constructor ##### -struct DistributedModel{A, I, M, R, G} +# TODO: add the full grid! + +struct DistributedModel{A, M} architecture :: A - index :: I - ranks :: R model :: M - connectivity :: G end function DistributedModel(; architecture, grid, boundary_conditions=nothing, model_kwargs...) - ranks = architecture.ranks + my_rank = architecture.my_rank + i, j, k = architecture.my_index + Rx, Ry, Rz = architecture.ranks + my_connectivity = architecture.connectivity - validate_tupled_argument(ranks, Int, "ranks") + ## Construct local grid Nx, Ny, Nz = size(grid) @@ -337,28 +243,6 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod zL, zR = grid.zF[1], grid.zF[Nz+1] Lx, Ly, Lz = length(grid) - Rx, Ry, Rz = ranks - total_ranks = Rx*Ry*Rz - - comm = MPI.COMM_WORLD - - mpi_ranks = MPI.Comm_size(comm) - my_rank = MPI.Comm_rank(comm) - - if total_ranks != mpi_ranks - throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " * - "with number of MPI ranks: $mpi_ranks. Exiting with return code 1.")) - MPI.Finalize() - exit(code=1) - end - - i, j, k = index = rank2index(my_rank, Rx, Ry, Rz) - @info "My rank: $my_rank, my index: $index" - - ##### - ##### Construct local grid - ##### - # Make sure we can put an integer number of grid points in each rank. @assert isinteger(Nx / Rx) @assert isinteger(Ny / Ry) @@ -371,19 +255,9 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod y₁, y₂ = yL + (j-1)*ly, yL + j*ly z₁, z₂ = zL + (k-1)*lz, zL + k*lz - @info "Constructing local grid: n=($nx, $ny, $nz), x ∈ [$x₁, $x₂], y ∈ [$y₁, $y₂], z ∈ [$z₁, $z₂]" my_grid = RegularCartesianGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) - ##### - ##### Construct local connectivity - ##### - - my_connectivity = RankConnectivity(index, ranks, topology(grid)) - @info "Local connectivity: $my_connectivity" - - ##### - ##### Change appropriate boundary conditions to halo communication BCs - ##### + ## Change appropriate boundary conditions to halo communication BCs # FIXME: Stop assuming (u, v, w, T, S). @@ -397,8 +271,6 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod S = haskey(bcs, :S) ? bcs.S : TracerBoundaryConditions(grid) ) - @debug "Injecting halo communication boundary conditions..." 
- communicative_bcs = ( u = inject_halo_communication_boundary_conditions(bcs.u, my_rank, my_connectivity), v = inject_halo_communication_boundary_conditions(bcs.v, my_rank, my_connectivity), @@ -407,9 +279,7 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod S = inject_halo_communication_boundary_conditions(bcs.S, my_rank, my_connectivity) ) - ##### - ##### Construct local model - ##### + ## Construct local model my_model = IncompressibleModel(; architecture = child_architecture(architecture), @@ -418,9 +288,5 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod model_kwargs... ) - return DistributedModel(architecture, index, ranks, my_model, my_connectivity) -end - -function Base.show(io::IO, dm::DistributedModel) - print(io, "DistributedModel with $(dm.ranks) ranks") + return DistributedModel(architecture, my_model) end From a04c583baba25f719cae4bbf4f5a7fcbe53d781a Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 19:33:48 -0500 Subject: [PATCH 049/100] Modular tests --- src/Distributed/test_distributed_model.jl | 197 ++++++++++++++++------ 1 file changed, 143 insertions(+), 54 deletions(-) diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index 4e515b6631..1653b284b5 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -11,17 +11,19 @@ comm = MPI.COMM_WORLD mpi_ranks = MPI.Comm_size(comm) @assert mpi_ranks == 4 -function test_triply_periodic_connectivity_with_411_ranks() +##### +##### Multi architectures and rank connectivity +##### + +function run_triply_periodic_rank_connectivity_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - arch = MultiCPU(ranks=(4, 1, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - @test my_rank == index2rank(dm.index..., dm.ranks...) + @test my_rank == index2rank(arch.my_index..., arch.ranks...) - model = dm.model - connectivity = dm.connectivity + connectivity = arch.connectivity # No communication in y and z. 
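+    # (ranks=(4, 1, 1): only east/west neighbours exist)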
@test isnothing(connectivity.south) @@ -43,35 +45,18 @@ function test_triply_periodic_connectivity_with_411_ranks() @test connectivity.west == 2 end - nx, ny, nz = size(model.grid) - @test model.grid.xF[1] == 0.25*my_rank - @test model.grid.xF[nx+1] == 0.25*(my_rank+1) - @test model.grid.yF[1] == 0 - @test model.grid.yF[ny+1] == 2 - @test model.grid.zF[1] == -3 - @test model.grid.zF[nz+1] == 0 - - for field in fields(model) - @test field.boundary_conditions.east isa HaloCommunicationBC - @test field.boundary_conditions.west isa HaloCommunicationBC - @test !isa(field.boundary_conditions.north, HaloCommunicationBC) - @test !isa(field.boundary_conditions.south, HaloCommunicationBC) - @test !isa(field.boundary_conditions.top, HaloCommunicationBC) - @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) - end + return nothing end -function test_triply_periodic_connectivity_with_141_ranks() +function run_triply_periodic_rank_connectivity_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - arch = MultiCPU(ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - @test my_rank == index2rank(dm.index..., dm.ranks...) + @test my_rank == index2rank(arch.my_index..., arch.ranks...) - model = dm.model - connectivity = dm.connectivity + connectivity = arch.connectivity # No communication in x and z. @test isnothing(connectivity.east) @@ -93,35 +78,18 @@ function test_triply_periodic_connectivity_with_141_ranks() @test connectivity.south == 2 end - nx, ny, nz = size(model.grid) - @test model.grid.xF[1] == 0 - @test model.grid.xF[nx+1] == 1 - @test model.grid.yF[1] == 0.5*my_rank - @test model.grid.yF[ny+1] == 0.5*(my_rank+1) - @test model.grid.zF[1] == -3 - @test model.grid.zF[nz+1] == 0 - - for field in fields(model) - @test !isa(field.boundary_conditions.east, HaloCommunicationBC) - @test !isa(field.boundary_conditions.west, HaloCommunicationBC) - @test field.boundary_conditions.north isa HaloCommunicationBC - @test field.boundary_conditions.south isa HaloCommunicationBC - @test !isa(field.boundary_conditions.top, HaloCommunicationBC) - @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) - end + return nothing end -function test_triply_periodic_connectivity_with_114_ranks() +function run_triply_periodic_rank_connectivity_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - arch = MultiCPU(ranks=(1, 1, 4)) - dm = DistributedModel(architecture=arch, grid=full_grid) + arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - @test my_rank == index2rank(dm.index..., dm.ranks...) + @test my_rank == index2rank(arch.my_index..., arch.ranks...) - model = dm.model - connectivity = dm.connectivity + connectivity = arch.connectivity # No communication in x and y. 
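+    # (ranks=(1, 1, 4): only top/bottom neighbours exist)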
@test isnothing(connectivity.east) @@ -143,6 +111,62 @@ function test_triply_periodic_connectivity_with_114_ranks() @test connectivity.bottom == 2 end + return nothing +end + +##### +##### Local grids for distributed models +##### + +function run_triply_periodic_local_grid_tests_with_411_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + + model = dm.model + nx, ny, nz = size(model.grid) + @test model.grid.xF[1] == 0.25*my_rank + @test model.grid.xF[nx+1] == 0.25*(my_rank+1) + @test model.grid.yF[1] == 0 + @test model.grid.yF[ny+1] == 2 + @test model.grid.zF[1] == -3 + @test model.grid.zF[nz+1] == 0 + + return nothing +end + +function run_triply_periodic_local_grid_tests_with_141_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + + model = dm.model + nx, ny, nz = size(model.grid) + @test model.grid.xF[1] == 0 + @test model.grid.xF[nx+1] == 1 + @test model.grid.yF[1] == 0.5*my_rank + @test model.grid.yF[ny+1] == 0.5*(my_rank+1) + @test model.grid.zF[1] == -3 + @test model.grid.zF[nz+1] == 0 + + return nothing +end + +function run_triply_periodic_local_grid_tests_with_114_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + + model = dm.model nx, ny, nz = size(model.grid) @test model.grid.xF[1] == 0 @test model.grid.xF[nx+1] == 1 @@ -151,7 +175,52 @@ function test_triply_periodic_connectivity_with_114_ranks() @test model.grid.zF[1] == -3 + 0.75*my_rank @test model.grid.zF[nz+1] == -3 + 0.75*(my_rank+1) - for field in fields(model) + return nothing +end + +##### +##### +##### + +function run_triply_periodic_bc_injection_tests_with_411_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + for field in fields(dm.model) + @test field.boundary_conditions.east isa HaloCommunicationBC + @test field.boundary_conditions.west isa HaloCommunicationBC + @test !isa(field.boundary_conditions.north, HaloCommunicationBC) + @test !isa(field.boundary_conditions.south, HaloCommunicationBC) + @test !isa(field.boundary_conditions.top, HaloCommunicationBC) + @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) + end +end + +function run_triply_periodic_bc_injection_tests_with_141_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + for field in fields(dm.model) + @test !isa(field.boundary_conditions.east, HaloCommunicationBC) + @test !isa(field.boundary_conditions.west, HaloCommunicationBC) + @test field.boundary_conditions.north isa HaloCommunicationBC + @test field.boundary_conditions.south isa HaloCommunicationBC + @test 
!isa(field.boundary_conditions.top, HaloCommunicationBC) + @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) + end +end + +function run_triply_periodic_bc_injection_tests_with_114_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + for field in fields(dm.model) @test !isa(field.boundary_conditions.east, HaloCommunicationBC) @test !isa(field.boundary_conditions.west, HaloCommunicationBC) @test !isa(field.boundary_conditions.north, HaloCommunicationBC) @@ -162,9 +231,29 @@ function test_triply_periodic_connectivity_with_114_ranks() end @testset "Distributed MPI Oceananigans" begin - test_triply_periodic_connectivity_with_411_ranks() - test_triply_periodic_connectivity_with_141_ranks() - test_triply_periodic_connectivity_with_114_ranks() + @info "Testing distributed MPI Oceananigans..." + + @testset "Multi architectures rank connectivity" begin + @info " Testing multi architecture rank connectivity..." + run_triply_periodic_rank_connectivity_tests_with_411_ranks() + run_triply_periodic_rank_connectivity_tests_with_141_ranks() + run_triply_periodic_rank_connectivity_tests_with_114_ranks() + end + + @testset "Local grids for distributed models" begin + @info " Testing local grids for distributed models..." + run_triply_periodic_local_grid_tests_with_411_ranks() + run_triply_periodic_local_grid_tests_with_141_ranks() + run_triply_periodic_local_grid_tests_with_114_ranks() + end + + @testset "Injection of halo communication BCs" begin + @info " Testing injection of halo communication BCs..." + run_triply_periodic_bc_injection_tests_with_411_ranks() + run_triply_periodic_bc_injection_tests_with_141_ranks() + run_triply_periodic_bc_injection_tests_with_114_ranks() + end + # TODO: 221 ranks # TODO: triply bounded end From bb9ab679fbef9b4a2e458c8e4a8d146f751a5f82 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 19:48:36 -0500 Subject: [PATCH 050/100] Need more files --- src/Distributed/distributed_architectures.jl | 2 + src/Distributed/distributed_model.jl | 223 +------------------ src/Distributed/halo_communication.jl | 151 +++++++++++++ src/Distributed/halo_communication_bcs.jl | 54 +++++ 4 files changed, 214 insertions(+), 216 deletions(-) create mode 100644 src/Distributed/halo_communication.jl create mode 100644 src/Distributed/halo_communication_bcs.jl diff --git a/src/Distributed/distributed_architectures.jl b/src/Distributed/distributed_architectures.jl index 400cab3908..796ef3a708 100644 --- a/src/Distributed/distributed_architectures.jl +++ b/src/Distributed/distributed_architectures.jl @@ -1,3 +1,5 @@ +using Oceananigans.Architectures + using Oceananigans.Grids: validate_tupled_argument # TODO: Put connectivity inside architecture? MPI should be initialize so you can construct it in there. 
diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 24ff6b5f8a..2a5c06593c 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -1,229 +1,19 @@ -import MPI +using MPI using Oceananigans -using Oceananigans.Architectures using Oceananigans.Grids -using KernelAbstractions: @kernel, @index, Event, MultiEvent -using Oceananigans.BoundaryConditions: BCType - -import Oceananigans.BoundaryConditions: - bctype_str, print_condition, - fill_halo_regions!, - fill_west_halo!, fill_east_halo!, fill_south_halo!, - fill_north_halo!, fill_bottom_halo!, fill_top_halo! - include("distributed_architectures.jl") - - -##### -##### Halo communication boundary condition -##### - -struct HaloCommunication <: BCType end - -HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} - -bctype_str(::HaloCommunicationBC) ="HaloCommunication" - -HaloCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(HaloCommunication, val; kwargs...) - -struct HaloCommunicationRanks{F, T} - from :: F - to :: T -end - -HaloCommunicationRanks(; from, to) = HaloCommunicationRanks(from, to) - -print_condition(hcr::HaloCommunicationRanks) = "(from rank $(hcr.from), to rank $(hcr.to))" - -function inject_halo_communication_boundary_conditions(field_bcs, my_rank, connectivity) - rank_east = connectivity.east - rank_west = connectivity.west - rank_north = connectivity.north - rank_south = connectivity.south - rank_top = connectivity.top - rank_bottom = connectivity.bottom - - east_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_east) - west_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_west) - north_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_north) - south_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_south) - top_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_top) - bottom_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_bottom) - - east_comm_bc = HaloCommunicationBoundaryCondition(east_comm_ranks) - west_comm_bc = HaloCommunicationBoundaryCondition(west_comm_ranks) - north_comm_bc = HaloCommunicationBoundaryCondition(north_comm_ranks) - south_comm_bc = HaloCommunicationBoundaryCondition(south_comm_ranks) - top_comm_bc = HaloCommunicationBoundaryCondition(top_comm_ranks) - bottom_comm_bc = HaloCommunicationBoundaryCondition(bottom_comm_ranks) - - x_bcs = CoordinateBoundaryConditions(isnothing(rank_west) ? field_bcs.west : west_comm_bc, - isnothing(rank_east) ? field_bcs.east : east_comm_bc) - - y_bcs = CoordinateBoundaryConditions(isnothing(rank_south) ? field_bcs.south : south_comm_bc, - isnothing(rank_north) ? field_bcs.north : north_comm_bc) - - z_bcs = CoordinateBoundaryConditions(isnothing(rank_bottom) ? field_bcs.bottom : bottom_comm_bc, - isnothing(rank_top) ? field_bcs.top : top_comm_bc) - - return FieldBoundaryConditions(x_bcs, y_bcs, z_bcs) -end - -##### -##### MPI tags for halo communication BCs -##### - -sides = (:west, :east, :south, :north, :top, :bottom) - -side_id = Dict( - :east => 1, :west => 2, - :north => 3, :south => 4, - :top => 5, :bottom => 6 -) - -opposite_side = Dict( - :east => :west, :west => :east, - :north => :south, :south => :north, - :top => :bottom, :bottom => :top -) - -# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). -const MAX_RANKS = 10^3 -RANK_DIGITS = 3 - -# Define functions that return unique send and recv MPI tags for each side. 
-for side in sides - side_str = string(side) - send_tag_fn_name = Symbol(side, :_send_tag) - recv_tag_fn_name = Symbol(side, :_recv_tag) - @eval begin - function $send_tag_fn_name(bc) - from_digits = string(bc.condition.from, pad=RANK_DIGITS) - to_digits = string(bc.condition.to, pad=RANK_DIGITS) - side_digit = string(side_id[Symbol($side_str)]) - return parse(Int, from_digits * to_digits * side_digit) - end - - function $recv_tag_fn_name(bc) - from_digits = string(bc.condition.from, pad=RANK_DIGITS) - to_digits = string(bc.condition.to, pad=RANK_DIGITS) - side_digit = string(side_id[opposite_side[Symbol($side_str)]]) - return parse(Int, to_digits * from_digits * side_digit) - end - end -end - -##### -##### Filling halos for halo communication boundary conditions -##### - -@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] -@inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :] -@inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :] -@inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :] -@inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H] -@inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H] - -@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ny + 2grid.Hy, grid.Nz + 2grid.Hz) -@inline south_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Hy, grid.Nz + 2grid.Hz) -@inline top_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Ny + 2grid.Hy, grid.Hz) - -const east_recv_buffer = west_recv_buffer -const north_recv_buffer = south_recv_buffer -const bottom_recv_buffer = top_recv_buffer - -@inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf) -@inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf) -@inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf) -@inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf) -@inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf) -@inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf) - -function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, args...) - - barrier = Event(device(child_architecture(arch))) - - east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...) - # north_event, south_event = fill_north_and_south_halos!(c, bcs.north, bcs.south, arch, barrier, grid, args...) - # top_event, bottom_event = fill_top_and_bottom_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...) - - events = [east_event, west_event] # , north_event, south_event, top_event, bottom_event] - events = filter(e -> e isa Event, events) - wait(device(child_architecture(arch)), MultiEvent(Tuple(events))) - - return nothing -end - -function fill_east_and_west_halos!(c, east_bc, west_bc, arch, barrier, grid, args...) - east_event = fill_east_halo!(c, east_bc, child_architecture(arch), barrier, grid, args...) - west_event = fill_west_halo!(c, west_bc, child_architecture(arch), barrier, grid, args...) - return east_event, west_event -end - -function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, args...) - # 1 -> send east halo to eastern rank and fill east halo from eastern rank's west halo. - # 2 -> send west halo to western rank and fill west halo from western rank's east halo. 
- - @assert east_bc.condition.from == west_bc.condition.from - my_rank = east_bc.condition.from - - rank_to_send_to1 = east_bc.condition.to - rank_to_send_to2 = west_bc.condition.to - - send_buffer1 = east_send_buffer(c, grid.Nx, grid.Hx) - send_buffer2 = west_send_buffer(c, grid.Nx, grid.Hx) - - send_tag1 = east_send_tag(east_bc) - send_tag2 = west_send_tag(west_bc) - - @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to1, send_tag=$send_tag1" - @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to2, send_tag=$send_tag2" - - send_req1 = MPI.Isend(send_buffer1, rank_to_send_to1, send_tag1, MPI.COMM_WORLD) - send_req2 = MPI.Isend(send_buffer2, rank_to_send_to2, send_tag2, MPI.COMM_WORLD) - - rank_to_recv_from1 = east_bc.condition.to - rank_to_recv_from2 = west_bc.condition.to - - recv_buffer1 = east_recv_buffer(grid) - recv_buffer2 = west_recv_buffer(grid) - - recv_tag1 = east_recv_tag(east_bc) - recv_tag2 = west_recv_tag(west_bc) - - @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from1, recv_tag=$recv_tag1" - @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from2, recv_tag=$recv_tag2" - - MPI.Recv!(recv_buffer1, rank_to_recv_from1, recv_tag1, MPI.COMM_WORLD) - MPI.Recv!(recv_buffer2, rank_to_recv_from2, recv_tag2, MPI.COMM_WORLD) - - @info "Communication done!" - - copy_recv_buffer_into_east_halo!(c, grid.Nx, grid.Hx, recv_buffer1) - copy_recv_buffer_into_west_halo!(c, grid.Nx, grid.Hx, recv_buffer2) - - # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * - # "send_tag=$send_tag, recv_tag=$recv_tag" - # - # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, - # recv_buffer, rank_recv_from, recv_tag, - # MPI.COMM_WORLD) - # - # @info "Sendrecv!: my_rank=$my_rank done!" - - return nothing, nothing -end +include("halo_communication_bcs.jl") +include("halo_communication.jl") ##### ##### Distributed model struct and constructor ##### -# TODO: add the full grid! - -struct DistributedModel{A, M} +struct DistributedModel{A, G, M} architecture :: A + grid :: G model :: M end @@ -255,6 +45,7 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod y₁, y₂ = yL + (j-1)*ly, yL + j*ly z₁, z₂ = zL + (k-1)*lz, zL + k*lz + # FIXME: local grid might have different topology! my_grid = RegularCartesianGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) ## Change appropriate boundary conditions to halo communication BCs @@ -288,5 +79,5 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod model_kwargs... ) - return DistributedModel(architecture, my_model) + return DistributedModel(architecture, grid, my_model) end diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl new file mode 100644 index 0000000000..68454c90dc --- /dev/null +++ b/src/Distributed/halo_communication.jl @@ -0,0 +1,151 @@ +using KernelAbstractions: @kernel, @index, Event, MultiEvent + +import Oceananigans.BoundaryConditions: + fill_halo_regions!, + fill_west_halo!, fill_east_halo!, fill_south_halo!, + fill_north_halo!, fill_bottom_halo!, fill_top_halo! 
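+
+# The methods below extend the halo-filling functions imported above so that a
+# HaloCommunicationBC triggers an MPI send/receive instead of a local halo fill.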
+ +##### +##### MPI tags for halo communication BCs +##### + +sides = (:west, :east, :south, :north, :top, :bottom) + +side_id = Dict( + :east => 1, :west => 2, + :north => 3, :south => 4, + :top => 5, :bottom => 6 +) + +opposite_side = Dict( + :east => :west, :west => :east, + :north => :south, :south => :north, + :top => :bottom, :bottom => :top +) + +# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). +const MAX_RANKS = 10^3 +RANK_DIGITS = 3 + +# Define functions that return unique send and recv MPI tags for each side. +for side in sides + side_str = string(side) + send_tag_fn_name = Symbol(side, :_send_tag) + recv_tag_fn_name = Symbol(side, :_recv_tag) + @eval begin + function $send_tag_fn_name(bc) + from_digits = string(bc.condition.from, pad=RANK_DIGITS) + to_digits = string(bc.condition.to, pad=RANK_DIGITS) + side_digit = string(side_id[Symbol($side_str)]) + return parse(Int, from_digits * to_digits * side_digit) + end + + function $recv_tag_fn_name(bc) + from_digits = string(bc.condition.from, pad=RANK_DIGITS) + to_digits = string(bc.condition.to, pad=RANK_DIGITS) + side_digit = string(side_id[opposite_side[Symbol($side_str)]]) + return parse(Int, to_digits * from_digits * side_digit) + end + end +end + +##### +##### Filling halos for halo communication boundary conditions +##### + +@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :] +@inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :] +@inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :] +@inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :] +@inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H] +@inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H] + +@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ny + 2grid.Hy, grid.Nz + 2grid.Hz) +@inline south_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Hy, grid.Nz + 2grid.Hz) +@inline top_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Ny + 2grid.Hy, grid.Hz) + +const east_recv_buffer = west_recv_buffer +const north_recv_buffer = south_recv_buffer +const bottom_recv_buffer = top_recv_buffer + +@inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf) +@inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf) +@inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf) +@inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf) +@inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf) +@inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf) + +function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, args...) + + barrier = Event(device(child_architecture(arch))) + + east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...) + # north_event, south_event = fill_north_and_south_halos!(c, bcs.north, bcs.south, arch, barrier, grid, args...) + # top_event, bottom_event = fill_top_and_bottom_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...) + + events = [east_event, west_event] # , north_event, south_event, top_event, bottom_event] + events = filter(e -> e isa Event, events) + wait(device(child_architecture(arch)), MultiEvent(Tuple(events))) + + return nothing +end + +function fill_east_and_west_halos!(c, east_bc, west_bc, arch, barrier, grid, args...) + east_event = fill_east_halo!(c, east_bc, child_architecture(arch), barrier, grid, args...) 
+ west_event = fill_west_halo!(c, west_bc, child_architecture(arch), barrier, grid, args...) + return east_event, west_event +end + +function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, args...) + # 1 -> send east halo to eastern rank and fill east halo from eastern rank's west halo. + # 2 -> send west halo to western rank and fill west halo from western rank's east halo. + + @assert east_bc.condition.from == west_bc.condition.from + my_rank = east_bc.condition.from + + rank_to_send_to1 = east_bc.condition.to + rank_to_send_to2 = west_bc.condition.to + + send_buffer1 = east_send_buffer(c, grid.Nx, grid.Hx) + send_buffer2 = west_send_buffer(c, grid.Nx, grid.Hx) + + send_tag1 = east_send_tag(east_bc) + send_tag2 = west_send_tag(west_bc) + + @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to1, send_tag=$send_tag1" + @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to2, send_tag=$send_tag2" + + send_req1 = MPI.Isend(send_buffer1, rank_to_send_to1, send_tag1, MPI.COMM_WORLD) + send_req2 = MPI.Isend(send_buffer2, rank_to_send_to2, send_tag2, MPI.COMM_WORLD) + + rank_to_recv_from1 = east_bc.condition.to + rank_to_recv_from2 = west_bc.condition.to + + recv_buffer1 = east_recv_buffer(grid) + recv_buffer2 = west_recv_buffer(grid) + + recv_tag1 = east_recv_tag(east_bc) + recv_tag2 = west_recv_tag(west_bc) + + @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from1, recv_tag=$recv_tag1" + @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from2, recv_tag=$recv_tag2" + + MPI.Recv!(recv_buffer1, rank_to_recv_from1, recv_tag1, MPI.COMM_WORLD) + MPI.Recv!(recv_buffer2, rank_to_recv_from2, recv_tag2, MPI.COMM_WORLD) + + @info "Communication done!" + + copy_recv_buffer_into_east_halo!(c, grid.Nx, grid.Hx, recv_buffer1) + copy_recv_buffer_into_west_halo!(c, grid.Nx, grid.Hx, recv_buffer2) + + # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * + # "send_tag=$send_tag, recv_tag=$recv_tag" + # + # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, + # recv_buffer, rank_recv_from, recv_tag, + # MPI.COMM_WORLD) + # + # @info "Sendrecv!: my_rank=$my_rank done!" + + return nothing, nothing +end \ No newline at end of file diff --git a/src/Distributed/halo_communication_bcs.jl b/src/Distributed/halo_communication_bcs.jl new file mode 100644 index 0000000000..aa21385545 --- /dev/null +++ b/src/Distributed/halo_communication_bcs.jl @@ -0,0 +1,54 @@ +using Oceananigans.BoundaryConditions: BCType + +import Oceananigans.BoundaryConditions: bctype_str, print_condition + +struct HaloCommunication <: BCType end + +HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} + +bctype_str(::HaloCommunicationBC) ="HaloCommunication" + +HaloCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(HaloCommunication, val; kwargs...) 
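+
+# Usage sketch (hypothetical ranks): a boundary condition describing a halo
+# exchange from rank 0 to rank 1 is built as
+#     HaloCommunicationBoundaryCondition(HaloCommunicationRanks(from=0, to=1))
+# with HaloCommunicationRanks defined below.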
+ +struct HaloCommunicationRanks{F, T} + from :: F + to :: T +end + +HaloCommunicationRanks(; from, to) = HaloCommunicationRanks(from, to) + +print_condition(hcr::HaloCommunicationRanks) = "(from rank $(hcr.from) to rank $(hcr.to))" + +function inject_halo_communication_boundary_conditions(field_bcs, my_rank, connectivity) + rank_east = connectivity.east + rank_west = connectivity.west + rank_north = connectivity.north + rank_south = connectivity.south + rank_top = connectivity.top + rank_bottom = connectivity.bottom + + east_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_east) + west_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_west) + north_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_north) + south_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_south) + top_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_top) + bottom_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_bottom) + + east_comm_bc = HaloCommunicationBoundaryCondition(east_comm_ranks) + west_comm_bc = HaloCommunicationBoundaryCondition(west_comm_ranks) + north_comm_bc = HaloCommunicationBoundaryCondition(north_comm_ranks) + south_comm_bc = HaloCommunicationBoundaryCondition(south_comm_ranks) + top_comm_bc = HaloCommunicationBoundaryCondition(top_comm_ranks) + bottom_comm_bc = HaloCommunicationBoundaryCondition(bottom_comm_ranks) + + x_bcs = CoordinateBoundaryConditions(isnothing(rank_west) ? field_bcs.west : west_comm_bc, + isnothing(rank_east) ? field_bcs.east : east_comm_bc) + + y_bcs = CoordinateBoundaryConditions(isnothing(rank_south) ? field_bcs.south : south_comm_bc, + isnothing(rank_north) ? field_bcs.north : north_comm_bc) + + z_bcs = CoordinateBoundaryConditions(isnothing(rank_bottom) ? field_bcs.bottom : bottom_comm_bc, + isnothing(rank_top) ? field_bcs.top : top_comm_bc) + + return FieldBoundaryConditions(x_bcs, y_bcs, z_bcs) +end \ No newline at end of file From 493e84e189fe5f53a55464a29cab7222c13f0ec0 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 20:47:09 -0500 Subject: [PATCH 051/100] East/west halo communication passes tests! 
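
This commit adds halo-view helpers (west_halo, east_halo, ...) in
distributed_utils.jl, a Base.show method for DistributedModel, and a first
communication test: each rank sets its fields to its own rank number, calls
fill_halo_regions!, and checks that its east and west halos end up holding
the neighboring ranks' values.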
--- src/Distributed/distributed_model.jl | 6 +++++ src/Distributed/distributed_utils.jl | 20 +++++++++++++++ src/Distributed/test_distributed_model.jl | 30 +++++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 src/Distributed/distributed_utils.jl diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 2a5c06593c..c2d02c16d5 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -3,6 +3,7 @@ using MPI using Oceananigans using Oceananigans.Grids +include("distributed_utils.jl") include("distributed_architectures.jl") include("halo_communication_bcs.jl") include("halo_communication.jl") @@ -81,3 +82,8 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod return DistributedModel(architecture, grid, my_model) end + +function Base.show(io::IO, dm::DistributedModel) + print(io, "DistributedModel with ") + print(io, dm.architecture) +end diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl new file mode 100644 index 0000000000..ecd6879832 --- /dev/null +++ b/src/Distributed/distributed_utils.jl @@ -0,0 +1,20 @@ +using Oceananigans.Grids: left_halo_indices, right_halo_indices +using Oceananigans.Fields: AbstractField + +@inline west_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = + view(f.data, left_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) + +@inline east_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = + view(f.data, right_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) + +@inline south_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = + view(f.data, :, left_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) + +@inline north_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = + view(f.data, :, right_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) + +@inline bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = + view(f.data, :, :, left_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :) + +@inline bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = + view(f.data, :, :, right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :) diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index 1653b284b5..bec9ba4578 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -230,6 +230,31 @@ function run_triply_periodic_bc_injection_tests_with_114_ranks() end end +##### +##### Halo communication +##### + +function run_triply_periodic_halo_communication_tests_with_411_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 1), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + for field in fields(dm.model) + set!(field, arch.my_rank) + fill_halo_regions!(field, arch) + + @test all(east_halo(u) .== arch.connectivity.east) + @test all(west_halo(u) .== arch.connectivity.west) + end + + return nothing +end + +##### +##### Run tests! +##### + @testset "Distributed MPI Oceananigans" begin @info "Testing distributed MPI Oceananigans..." @@ -254,6 +279,11 @@ end run_triply_periodic_bc_injection_tests_with_114_ranks() end + @testset "Halo communication" begin + @info " Testing halo communication..." 
+        run_triply_periodic_halo_communication_tests_with_411_ranks()
+    end
+
     # TODO: 221 ranks
     # TODO: triply bounded
 end

From e117b1384db2ec776e73e16b06a6301207f5cfb8 Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Fri, 5 Feb 2021 21:25:36 -0500
Subject: [PATCH 052/100] More modular halo communication

---
 src/Distributed/halo_communication.jl     | 94 ++++++++++++-----------
 src/Distributed/test_distributed_model.jl |  5 +-
 2 files changed, 53 insertions(+), 46 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 68454c90dc..521d02387a 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -28,23 +28,28 @@ const MAX_RANKS = 10^3
 RANK_DIGITS = 3
 
 # Define functions that return unique send and recv MPI tags for each side.
+# Each tag is an integer whose decimal digits encode, from the ones place:
+# digit 1: the side
+# digits 2-4: the rank being sent to
+# digits 5-7: the rank sending
+
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol(side, :_send_tag)
     recv_tag_fn_name = Symbol(side, :_recv_tag)
     @eval begin
-        function $send_tag_fn_name(bc)
-            from_digits = string(bc.condition.from, pad=RANK_DIGITS)
-            to_digits = string(bc.condition.to, pad=RANK_DIGITS)
+        function $send_tag_fn_name(my_rank, rank_to_send_to)
+            from_digits = string(my_rank, pad=RANK_DIGITS)
+            to_digits = string(rank_to_send_to, pad=RANK_DIGITS)
             side_digit = string(side_id[Symbol($side_str)])
             return parse(Int, from_digits * to_digits * side_digit)
         end
 
-        function $recv_tag_fn_name(bc)
-            from_digits = string(bc.condition.from, pad=RANK_DIGITS)
-            to_digits = string(bc.condition.to, pad=RANK_DIGITS)
+        function $recv_tag_fn_name(my_rank, rank_to_recv_from)
+            from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
+            to_digits = string(my_rank, pad=RANK_DIGITS)
             side_digit = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, to_digits * from_digits * side_digit)
+            return parse(Int, from_digits * to_digits * side_digit)
         end
     end
 end
@@ -96,56 +101,57 @@ function fill_east_and_west_halos!(c, east_bc, west_bc, arch, barrier, grid, arg
     return east_event, west_event
 end
 
-function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, args...)
-    # 1 -> send east halo to eastern rank and fill east halo from eastern rank's west halo.
-    # 2 -> send west halo to western rank and fill west halo from western rank's east halo.
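+
+# For example, with RANK_DIGITS = 3, rank 1 sending east to rank 2 uses
+# east_send_tag(1, 2) == parse(Int, "001" * "002" * "1") == 10021, and the
+# receiving rank computes west_recv_tag(2, 1) == 10021, so each Isend below
+# is matched with the corresponding Recv! on the neighboring rank.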
+function send_east_halo(c, grid, my_rank, rank_to_send_to) + send_buffer = east_send_buffer(c, grid.Nx, grid.Hx) + send_tag = east_send_tag(my_rank, rank_to_send_to) - @assert east_bc.condition.from == west_bc.condition.from - my_rank = east_bc.condition.from + @debug "Sending east halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" + status = MPI.Isend(send_buffer, rank_to_send_to, send_tag, MPI.COMM_WORLD) - rank_to_send_to1 = east_bc.condition.to - rank_to_send_to2 = west_bc.condition.to + return status +end + +function send_west_halo(c, grid, my_rank, rank_to_send_to) + send_buffer = west_send_buffer(c, grid.Nx, grid.Hx) + send_tag = west_send_tag(my_rank, rank_to_send_to) - send_buffer1 = east_send_buffer(c, grid.Nx, grid.Hx) - send_buffer2 = west_send_buffer(c, grid.Nx, grid.Hx) + @debug "Sending west halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" + status = MPI.Isend(send_buffer, rank_to_send_to, send_tag, MPI.COMM_WORLD) - send_tag1 = east_send_tag(east_bc) - send_tag2 = west_send_tag(west_bc) + return status +end - @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to1, send_tag=$send_tag1" - @info "MPI.Isend: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to2, send_tag=$send_tag2" +function recv_and_fill_east_halo!(c, grid, my_rank, rank_to_recv_from) + recv_buffer = east_recv_buffer(grid) + recv_tag = east_recv_tag(my_rank, rank_to_recv_from) - send_req1 = MPI.Isend(send_buffer1, rank_to_send_to1, send_tag1, MPI.COMM_WORLD) - send_req2 = MPI.Isend(send_buffer2, rank_to_send_to2, send_tag2, MPI.COMM_WORLD) + @debug "Receiving east halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" + MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) - rank_to_recv_from1 = east_bc.condition.to - rank_to_recv_from2 = west_bc.condition.to + copy_recv_buffer_into_east_halo!(c, grid.Nx, grid.Hx, recv_buffer) - recv_buffer1 = east_recv_buffer(grid) - recv_buffer2 = west_recv_buffer(grid) + return nothing +end - recv_tag1 = east_recv_tag(east_bc) - recv_tag2 = west_recv_tag(west_bc) +function recv_and_fill_west_halo!(c, grid, my_rank, rank_to_recv_from) + recv_buffer = west_recv_buffer(grid) + recv_tag = west_recv_tag(my_rank, rank_to_recv_from) - @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from1, recv_tag=$recv_tag1" - @info "MPI.Recv!: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from2, recv_tag=$recv_tag2" + @debug "Receiving west halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" + MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) - MPI.Recv!(recv_buffer1, rank_to_recv_from1, recv_tag1, MPI.COMM_WORLD) - MPI.Recv!(recv_buffer2, rank_to_recv_from2, recv_tag2, MPI.COMM_WORLD) + copy_recv_buffer_into_west_halo!(c, grid.Nx, grid.Hx, recv_buffer) - @info "Communication done!" + return nothing +end - copy_recv_buffer_into_east_halo!(c, grid.Nx, grid.Hx, recv_buffer1) - copy_recv_buffer_into_west_halo!(c, grid.Nx, grid.Hx, recv_buffer2) +function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, args...) 
+ my_rank = east_bc.condition.from + send_east_halo(c, grid, my_rank, east_bc.condition.to) + send_west_halo(c, grid, my_rank, west_bc.condition.to) - # @info "Sendrecv!: my_rank=$my_rank, rank_send_to=rank_recv_from=$rank_send_to, " * - # "send_tag=$send_tag, recv_tag=$recv_tag" - # - # MPI.Sendrecv!(send_buffer, rank_send_to, send_tag, - # recv_buffer, rank_recv_from, recv_tag, - # MPI.COMM_WORLD) - # - # @info "Sendrecv!: my_rank=$my_rank done!" + recv_and_fill_east_halo!(c, grid, my_rank, east_bc.condition.to) + recv_and_fill_west_halo!(c, grid, my_rank, west_bc.condition.to) return nothing, nothing -end \ No newline at end of file +end diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index bec9ba4578..77e4d22ae1 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -244,8 +244,8 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks() set!(field, arch.my_rank) fill_halo_regions!(field, arch) - @test all(east_halo(u) .== arch.connectivity.east) - @test all(west_halo(u) .== arch.connectivity.west) + @test all(east_halo(field) .== arch.connectivity.east) + @test all(west_halo(field) .== arch.connectivity.west) end return nothing @@ -279,6 +279,7 @@ end run_triply_periodic_bc_injection_tests_with_114_ranks() end + # TODO: Larger halos! @testset "Halo communication" begin @info " Testing halo communication..." run_triply_periodic_halo_communication_tests_with_411_ranks() From 3cd719ce1c34142b7d3438e31c2efdef314a6258 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 21:53:07 -0500 Subject: [PATCH 053/100] Send and receive views to avoid memory allocations --- src/Distributed/distributed_utils.jl | 32 +++++++++++--- src/Distributed/halo_communication.jl | 61 +++++++++------------------ 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl index ecd6879832..dfd6dc0fbc 100644 --- a/src/Distributed/distributed_utils.jl +++ b/src/Distributed/distributed_utils.jl @@ -1,20 +1,38 @@ -using Oceananigans.Grids: left_halo_indices, right_halo_indices +using Oceananigans.Grids: left_halo_indices, right_halo_indices, underlying_left_halo_indices, underlying_right_halo_indices using Oceananigans.Fields: AbstractField -@inline west_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = +west_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = view(f.data, left_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) -@inline east_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = +east_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = view(f.data, right_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) -@inline south_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = +south_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = view(f.data, :, left_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) -@inline north_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = +north_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = view(f.data, :, right_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) -@inline bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = +bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = view(f.data, :, :, left_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :) -@inline bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = +bottom_halo(f::AbstractField{LX, LY, LZ}) where 
{LX, LY, LZ} =
     view(f.data, :, :, right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :)
+
+underlying_west_halo(f, grid, location) =
+    view(f.parent, underlying_left_halo_indices(location, topology(grid, 1), grid.Nx, grid.Hx), :, :)
+
+underlying_east_halo(f, grid, location) =
+    view(f.parent, underlying_right_halo_indices(location, topology(grid, 1), grid.Nx, grid.Hx), :, :)
+
+underlying_south_halo(f, grid, location) =
+    view(f.parent, :, underlying_left_halo_indices(location, topology(grid, 2), grid.Ny, grid.Hy), :)
+
+underlying_north_halo(f, grid, location) =
+    view(f.parent, :, underlying_right_halo_indices(location, topology(grid, 2), grid.Ny, grid.Hy), :)
+
+underlying_bottom_halo(f, grid, location) =
+    view(f.parent, :, :, underlying_left_halo_indices(location, topology(grid, 3), grid.Nz, grid.Hz))
+
+underlying_top_halo(f, grid, location) =
+    view(f.parent, :, :, underlying_right_halo_indices(location, topology(grid, 3), grid.Nz, grid.Hz))
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 521d02387a..e773fa9356 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -58,33 +58,14 @@ end
 ##### Filling halos for halo communication boundary conditions
 #####
 
-@inline west_send_buffer(c, N, H) = c.parent[N+1:N+H, :, :]
-@inline east_send_buffer(c, N, H) = c.parent[1+H:2H, :, :]
-@inline south_send_buffer(c, N, H) = c.parent[:, N+1:N+H, :]
-@inline north_send_buffer(c, N, H) = c.parent[:, 1+H:2H, :]
-@inline top_send_buffer(c, N, H) = c.parent[:, :, 1+H:2H]
-@inline bottom_send_buffer(c, N, H) = c.parent[:, :, N+1:N+H]
-
-@inline west_recv_buffer(grid) = zeros(grid.Hx, grid.Ny + 2grid.Hy, grid.Nz + 2grid.Hz)
-@inline south_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Hy, grid.Nz + 2grid.Hz)
-@inline top_recv_buffer(grid) = zeros(grid.Nx + 2grid.Hx, grid.Ny + 2grid.Hy, grid.Hz)
-
-const east_recv_buffer = west_recv_buffer
-const north_recv_buffer = south_recv_buffer
-const bottom_recv_buffer = top_recv_buffer
-
-@inline copy_recv_buffer_into_west_halo!(c, N, H, buf) = (c.parent[ 1:H, :, :] .= buf)
-@inline copy_recv_buffer_into_east_halo!(c, N, H, buf) = (c.parent[N+H+1:N+2H, :, :] .= buf)
-@inline copy_recv_buffer_into_south_halo!(c, N, H, buf) = (c.parent[:, 1:H, :] .= buf)
-@inline copy_recv_buffer_into_north_halo!(c, N, H, buf) = (c.parent[:, N+H+1:N+2H, :] .= buf)
-@inline copy_recv_buffer_into_bottom_halo!(c, N, H, buf) = (c.parent[:, :, 1:H ] .= buf)
-@inline copy_recv_buffer_into_top_halo!(c, N, H, buf) = (c.parent[:, :, N+H+1:N+2H] .= buf)
-
-function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, args...)
+fill_halo_regions!(field::AbstractField{LX, LY, LZ}, arch::AbstractMultiArchitecture, args...) where {LX, LY, LZ} =
+    fill_halo_regions!(field.data, field.boundary_conditions, arch, field.grid, (LX, LY, LZ), args...)
+
+function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, c_location, args...)
 
     barrier = Event(device(child_architecture(arch)))
 
-    east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...)
+    east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, c_location, args...)
     # north_event, south_event = fill_north_and_south_halos!(c, bcs.north, bcs.south, arch, barrier, grid, args...)
     # top_event, bottom_event = fill_top_and_bottom_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...)
@@ -95,14 +76,14 @@ function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitectu return nothing end -function fill_east_and_west_halos!(c, east_bc, west_bc, arch, barrier, grid, args...) +function fill_east_and_west_halos!(c, east_bc, west_bc, arch, barrier, grid, location, args...) east_event = fill_east_halo!(c, east_bc, child_architecture(arch), barrier, grid, args...) west_event = fill_west_halo!(c, west_bc, child_architecture(arch), barrier, grid, args...) return east_event, west_event end -function send_east_halo(c, grid, my_rank, rank_to_send_to) - send_buffer = east_send_buffer(c, grid.Nx, grid.Hx) +function send_east_halo(c, grid, c_location, my_rank, rank_to_send_to) + send_buffer = underlying_east_halo(c, grid, c_location) send_tag = east_send_tag(my_rank, rank_to_send_to) @debug "Sending east halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" @@ -111,8 +92,8 @@ function send_east_halo(c, grid, my_rank, rank_to_send_to) return status end -function send_west_halo(c, grid, my_rank, rank_to_send_to) - send_buffer = west_send_buffer(c, grid.Nx, grid.Hx) +function send_west_halo(c, grid, c_location, my_rank, rank_to_send_to) + send_buffer = underlying_west_halo(c, grid, c_location) send_tag = west_send_tag(my_rank, rank_to_send_to) @debug "Sending west halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" @@ -121,37 +102,33 @@ function send_west_halo(c, grid, my_rank, rank_to_send_to) return status end -function recv_and_fill_east_halo!(c, grid, my_rank, rank_to_recv_from) - recv_buffer = east_recv_buffer(grid) +function recv_and_fill_east_halo!(c, grid, c_location, my_rank, rank_to_recv_from) + recv_buffer = underlying_east_halo(c, grid, c_location) recv_tag = east_recv_tag(my_rank, rank_to_recv_from) @debug "Receiving east halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) - copy_recv_buffer_into_east_halo!(c, grid.Nx, grid.Hx, recv_buffer) - return nothing end -function recv_and_fill_west_halo!(c, grid, my_rank, rank_to_recv_from) - recv_buffer = west_recv_buffer(grid) +function recv_and_fill_west_halo!(c, grid, c_location, my_rank, rank_to_recv_from) + recv_buffer = underlying_west_halo(c, grid, c_location) recv_tag = west_recv_tag(my_rank, rank_to_recv_from) @debug "Receiving west halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) - copy_recv_buffer_into_west_halo!(c, grid.Nx, grid.Hx, recv_buffer) - return nothing end -function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, args...) +function fill_east_and_west_halos!(c, east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, c_location, args...) 
my_rank = east_bc.condition.from - send_east_halo(c, grid, my_rank, east_bc.condition.to) - send_west_halo(c, grid, my_rank, west_bc.condition.to) + send_east_halo(c, grid, c_location, my_rank, east_bc.condition.to) + send_west_halo(c, grid, c_location, my_rank, west_bc.condition.to) - recv_and_fill_east_halo!(c, grid, my_rank, east_bc.condition.to) - recv_and_fill_west_halo!(c, grid, my_rank, west_bc.condition.to) + recv_and_fill_east_halo!(c, grid, c_location, my_rank, east_bc.condition.to) + recv_and_fill_west_halo!(c, grid, c_location, my_rank, west_bc.condition.to) return nothing, nothing end From 8fcfe01ee832ed7bc408d73030aaefe9f3c39fd0 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 22:47:01 -0500 Subject: [PATCH 054/100] Nuke super ancient sandbox --- sandbox/tiled_halos.jl | 70 ------------ sandbox/tiled_halos_mpi.jl | 215 ------------------------------------- 2 files changed, 285 deletions(-) delete mode 100644 sandbox/tiled_halos.jl delete mode 100644 sandbox/tiled_halos_mpi.jl diff --git a/sandbox/tiled_halos.jl b/sandbox/tiled_halos.jl deleted file mode 100644 index e2f90babb4..0000000000 --- a/sandbox/tiled_halos.jl +++ /dev/null @@ -1,70 +0,0 @@ -using Oceananigans, Test - -@inline incmod1(a, n) = ifelse(a==n, 1, a + 1) -@inline decmod1(a, n) = ifelse(a==1, n, a - 1) -@inline index2rank(I, J, Mx, My) = J*My + I - -@inline north_halo(tile) = @views @inbounds tile.data[1-tile.grid.Hx:0, :, :] -@inline south_halo(tile) = @views @inbounds tile.data[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :] -@inline west_halo(tile) = @views @inbounds tile.data[:, 1-tile.grid.Hy:0, :] -@inline east_halo(tile) = @views @inbounds tile.data[:, tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :] - -@inline north_data(tile) = @views @inbounds tile.data[1:tile.grid.Hx, :, :] -@inline south_data(tile) = @views @inbounds tile.data[tile.grid.Nx-tile.grid.Hx+1:tile.grid.Nx, :, :] -@inline west_data(tile) = @views @inbounds tile.data[:, 1:tile.grid.Hy, :] -@inline east_data(tile) = @views @inbounds tile.data[:, tile.grid.Ny-tile.grid.Hy+1:tile.grid.Ny, :] - -function fill_halo_regions_tiled!(tiles, Mx, My) - for J in 0:My-1, I in 0:Mx-1 - rank = index2rank(I, J, Mx, My) - - I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) - J⁻, J⁺ = mod(J-1, My), mod(J+1, My) - - north_rank = index2rank(I, J⁻, Mx, My) - south_rank = index2rank(I, J⁺, Mx, My) - east_rank = index2rank(I⁺, J, Mx, My) - west_rank = index2rank(I⁻, J, Mx, My) - - east_halo(tiles[rank+1]) .= west_data(tiles[east_rank+1]) - west_halo(tiles[rank+1]) .= east_data(tiles[west_rank+1]) - north_halo(tiles[rank+1]) .= south_data(tiles[north_rank+1]) - south_halo(tiles[rank+1]) .= north_data(tiles[south_rank+1]) - end -end - -FT, arch = Float64, CPU() - -Nx, Ny, Nz = 16, 16, 16 -Lx, Ly, Lz = 10, 10, 10 -N, L = (Nx, Ny, Nz), (Lx, Ly, Lz) - -grid = RegularCartesianGrid(N, L) - -# MPI ranks along each dimension -Mx, My = 2, 2 - -R = rand(Nx, Ny, Nz) - -tiles = [] -for I in 0:Mx-1, J in 0:My-1 - Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz - Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz - tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) - - tile = CellField(FT, arch, tile_grid) - - i1, i2 = I*Nx′+1, (I+1)*Nx′ - j1, j2 = J*Ny′+1, (J+1)*Ny′ - data(tile) .= R[i1:i2, j1:j2, :] - - push!(tiles, tile) -end - -fill_halo_regions_tiled!(tiles, Mx, My) -fill_halo_regions_tiled!(tiles, Mx, My) - -@test all(tiles[1].data[1:end, 1:end, :] .== R[1:9, 1:9, :]) -@test all(tiles[2].data[1:end, 0:end-1, :] .== R[1:9, 8:end, :]) -@test all(tiles[3].data[0:end-1, 1:end, :] 
.== R[8:end, 1:9, :]) -@test all(tiles[4].data[0:end-1, 0:end-1, :] .== R[8:end, 8:end, :]) diff --git a/sandbox/tiled_halos_mpi.jl b/sandbox/tiled_halos_mpi.jl deleted file mode 100644 index 4a01425482..0000000000 --- a/sandbox/tiled_halos_mpi.jl +++ /dev/null @@ -1,215 +0,0 @@ -using Printf - -using CuArrays -import MPI - -using Oceananigans - -# Source: https://github.com/JuliaCI/BenchmarkTools.jl/blob/master/src/trials.jl -function prettytime(t) - if t < 1e3 - value, units = t, "ns" - elseif t < 1e6 - value, units = t / 1e3, "μs" - elseif t < 1e9 - value, units = t / 1e6, "ms" - else - s = t / 1e9 - if s < 60 - value, units = s, "s" - else - value, units = (s / 60), "min" - end - end - return string(@sprintf("%.3f", value), " ", units) -end - -function prettybandwidth(b) - if b < 1024 - val, units = b, "B/s" - elseif b < 1024^2 - val, units = b / 1024, "KiB/s" - elseif b < 1024^3 - val, units = b / 1024^2, "MiB/s" - else - val, units = b / 1024^3, "GiB/s" - end - return string(@sprintf("%.3f", val), " ", units) -end - -@inline index2rank(I, J, Mx, My) = J*My + I -@inline rank2index(r, Mx, My) = mod(r, Mx), div(r, My) - -@inline north_halo(tile) = @views @inbounds tile.data.parent[1:tile.grid.Hx, :, :] -@inline south_halo(tile) = @views @inbounds tile.data.parent[tile.grid.Nx+tile.grid.Hx+1:tile.grid.Nx+2tile.grid.Hx, :, :] -@inline west_halo(tile) = @views @inbounds tile.data.parent[:, 1:tile.grid.Hy, :] -@inline east_halo(tile) = @views @inbounds tile.data.parent[:, tile.grid.Ny+tile.grid.Hy+1:tile.grid.Ny+2tile.grid.Hy, :] - -@inline north_data(tile) = @views @inbounds tile.data.parent[1+tile.grid.Hx:2tile.grid.Hx, :, :] -@inline south_data(tile) = @views @inbounds tile.data.parent[tile.grid.Nx+1:tile.grid.Nx+tile.grid.Hx, :, :] -@inline west_data(tile) = @views @inbounds tile.data.parent[:, 1+tile.grid.Hy:2tile.grid.Hy, :] -@inline east_data(tile) = @views @inbounds tile.data.parent[:, tile.grid.Ny+1:tile.grid.Ny+tile.grid.Hy, :] - -@inline distribute_tag(rank) = 100 + rank -@inline send_west_tag(rank) = 200 + rank -@inline send_east_tag(rank) = 300 + rank -@inline send_north_tag(rank) = 400 + rank -@inline send_south_tag(rank) = 500 + rank - -function send_halo_data(tile, Mx, My, comm) - rank = MPI.Comm_rank(comm) - - I, J = rank2index(rank, Mx, My) - I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) - J⁻, J⁺ = mod(J-1, My), mod(J+1, My) - - north_rank = index2rank(I, J⁻, Mx, My) - south_rank = index2rank(I, J⁺, Mx, My) - east_rank = index2rank(I⁺, J, Mx, My) - west_rank = index2rank(I⁻, J, Mx, My) - - # cuzeros doesn't exist anymore. Use similar! 
- west_data_buf = zeros(size(west_data(tile))) - east_data_buf = zeros(size(east_data(tile))) - north_data_buf = zeros(size(north_data(tile))) - south_data_buf = zeros(size(south_data(tile))) - - west_data_buf .= copy(west_data(tile)) - east_data_buf .= copy(east_data(tile)) - north_data_buf .= copy(north_data(tile)) - south_data_buf .= copy(south_data(tile)) - - se_req = MPI.Isend(east_data_buf, east_rank, send_east_tag(rank), comm) - sw_req = MPI.Isend(west_data_buf, west_rank, send_west_tag(rank), comm) - sn_req = MPI.Isend(north_data_buf, north_rank, send_north_tag(rank), comm) - ss_req = MPI.Isend(south_data_buf, south_rank, send_south_tag(rank), comm) - - @debug "[rank $rank] sending #$(send_east_tag(rank)) to rank $east_rank" - @debug "[rank $rank] sending #$(send_west_tag(rank)) to rank $west_rank" - @debug "[rank $rank] sending #$(send_north_tag(rank)) to rank $north_rank" - @debug "[rank $rank] sending #$(send_south_tag(rank)) to rank $south_rank" -end - -function receive_halo_data(tile, Mx, My, comm) - rank = MPI.Comm_rank(comm) - - I, J = rank2index(rank, Mx, My) - I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) - J⁻, J⁺ = mod(J-1, My), mod(J+1, My) - - north_rank = index2rank(I, J⁻, Mx, My) - south_rank = index2rank(I, J⁺, Mx, My) - east_rank = index2rank(I⁺, J, Mx, My) - west_rank = index2rank(I⁻, J, Mx, My) - - west_halo_buf = zeros(size(west_halo(tile))) - east_halo_buf = zeros(size(east_halo(tile))) - north_halo_buf = zeros(size(north_halo(tile))) - south_halo_buf = zeros(size(south_halo(tile))) - - re_req = MPI.Irecv!(west_halo_buf, west_rank, send_east_tag(west_rank), comm) - rw_req = MPI.Irecv!(east_halo_buf, east_rank, send_west_tag(east_rank), comm) - rn_req = MPI.Irecv!(south_halo_buf, south_rank, send_north_tag(south_rank), comm) - rs_req = MPI.Irecv!(north_halo_buf, north_rank, send_south_tag(north_rank), comm) - - @debug "[rank $rank] waiting for #$(send_east_tag(west_rank)) from rank $west_rank..." - @debug "[rank $rank] waiting for #$(send_west_tag(east_rank)) from rank $east_rank..." - @debug "[rank $rank] waiting for #$(send_north_tag(south_rank)) from rank $south_rank..." - @debug "[rank $rank] waiting for #$(send_south_tag(north_rank)) from rank $north_rank..." 
- - MPI.Waitall!([re_req, rw_req, rn_req, rs_req]) - - east_halo(tile) .= CuArray(east_halo_buf) - west_halo(tile) .= CuArray(west_halo_buf) - north_halo(tile) .= CuArray(north_halo_buf) - south_halo(tile) .= CuArray(south_halo_buf) -end - -function fill_halo_regions_mpi!(FT, arch, Nx, Ny, Nz, Mx, My) - Lx, Ly, Lz = 10, 10, 10 - - Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz - Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz - - comm = MPI.COMM_WORLD - - MPI.Barrier(comm) - - rank = MPI.Comm_rank(comm) - R = MPI.Comm_size(comm) - - I, J = rank2index(rank, Mx, My) - I⁻, I⁺ = mod(I-1, Mx), mod(I+1, Mx) - J⁻, J⁺ = mod(J-1, My), mod(J+1, My) - Nx′, Ny′, Nz′ = Int(Nx/Mx), Int(Ny/My), Nz - Lx′, Ly′, Lz′ = Lx/Mx, Ly/My, Lz - - north_rank = index2rank(I, J⁻, Mx, My) - south_rank = index2rank(I, J⁺, Mx, My) - east_rank = index2rank(I⁺, J, Mx, My) - west_rank = index2rank(I⁻, J, Mx, My) - - tile_grid = RegularCartesianGrid((Nx′, Ny′, Nz′), (Lx′, Ly′, Lz′)) - tile = CellField(FT, arch, tile_grid) - - send_reqs = MPI.Request[] - if rank == 0 - rands = rand(Nx, Ny, Nz) - - for r in 1:Mx*My-1 - I′, J′ = rank2index(r, Mx, My) - i1, i2 = I′*Nx′+1, (I′+1)*Nx′ - j1, j2 = J′*Ny′+1, (J′+1)*Ny′ - send_mesg = rands[i1:i2, j1:j2, :] - - println("[rank $rank] Sending rands[$i1:$i2, $j1:$j2, :] to rank $r...") - sreq = MPI.Isend(send_mesg, r, distribute_tag(r), comm) - push!(send_reqs, sreq) - end - - data(tile) .= rands[1:Nx′, 1:Ny′, :] - - MPI.Waitall!(send_reqs) - end - - if rank != 0 - println("[rank $rank] Receiving tile from rank 0...") - recv_mesg = zeros(FT, Nx′, Ny′, Nz′) - rreq = MPI.Irecv!(recv_mesg, 0, distribute_tag(rank), comm) - - stats = MPI.Wait!(rreq) - data(tile) .= recv_mesg - end - - println("[rank $rank] Sending halo data...") - send_halo_data(tile, Mx, My, comm) - - println("[rank $rank] Receiving halo data...") - receive_halo_data(tile, Mx, My, comm) - - MPI.Barrier(comm) - if rank == 0 - tic = time_ns() - end - - println("[rank $rank] Sending halo data...") - send_halo_data(tile, Mx, My, comm) - - println("[rank $rank] Receiving halo data...") - receive_halo_data(tile, Mx, My, comm) - - MPI.Barrier(comm) - if rank == 0 - t = (time_ns() - tic) - ts = t / 1e9 - @info "$R ranks halo communication time: $(prettytime(t))" - - Hx, Hy = 1, 1 - data_size = sizeof(FT) * 2Nz*(Hx*Nx + Hy*Ny) - @info "$R ranks halo communication bandwidth: $(prettybandwidth(data_size/ts))" - end -end - -MPI.Init() -fill_halo_regions_mpi!(Float64, GPU(), 192, 192, 192, 3, 3) -MPI.Finalize() From f816d546101b00c2ba09541b4b35d26f3e628d9b Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 22:47:49 -0500 Subject: [PATCH 055/100] Beautiful metaprogramming for halo communication --- src/Distributed/distributed_utils.jl | 2 +- src/Distributed/halo_communication.jl | 130 ++++++++++++++-------- src/Distributed/test_distributed_model.jl | 38 ++++++- 3 files changed, 122 insertions(+), 48 deletions(-) diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl index dfd6dc0fbc..117ae74f3c 100644 --- a/src/Distributed/distributed_utils.jl +++ b/src/Distributed/distributed_utils.jl @@ -16,7 +16,7 @@ north_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = view(f.data, :, :, left_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :) -bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = +top_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = view(f.data, :, :, right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), 
:) underlying_west_halo(f, grid, location) = diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index e773fa9356..2dc48f4857 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -24,7 +24,7 @@ opposite_side = Dict( ) # Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). -const MAX_RANKS = 10^3 +MAX_RANKS = 10^3 RANK_DIGITS = 3 # Define functions that return unique send and recv MPI tags for each side. @@ -35,8 +35,8 @@ RANK_DIGITS = 3 for side in sides side_str = string(side) - send_tag_fn_name = Symbol(side, :_send_tag) - recv_tag_fn_name = Symbol(side, :_recv_tag) + send_tag_fn_name = Symbol("$(side)_send_tag") + recv_tag_fn_name = Symbol("$(side)_recv_tag") @eval begin function $send_tag_fn_name(my_rank, rank_to_send_to) from_digits = string(my_rank, pad=RANK_DIGITS) @@ -61,74 +61,112 @@ end fill_halo_regions!(field::AbstractField{LX, LY, LZ}, arch::AbstractMultiArchitecture, args...) where {LX, LY, LZ} = fill_halo_regions!(field.data, field.boundary_conditions, arch, field.grid, (LX, LY, LZ), args...) -function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, location, args...) +function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, c_location, args...) barrier = Event(device(child_architecture(arch))) - east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, location, args...) - # north_event, south_event = fill_north_and_south_halos!(c, bcs.north, bcs.south, arch, barrier, grid, args...) - # top_event, bottom_event = fill_top_and_bottom_halos!(c, bcs.east, bcs.west, arch, barrier, grid, args...) + east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, c_location, args...) + north_event, south_event = fill_north_and_south_halos!(c, bcs.north, bcs.south, arch, barrier, grid, c_location, args...) + top_event, bottom_event = fill_top_and_bottom_halos!(c, bcs.top, bcs.bottom, arch, barrier, grid, c_location, args...) - events = [east_event, west_event] # , north_event, south_event, top_event, bottom_event] + events = [east_event, west_event, north_event, south_event, top_event, bottom_event] events = filter(e -> e isa Event, events) wait(device(child_architecture(arch)), MultiEvent(Tuple(events))) return nothing end -function fill_east_and_west_halos!(c, east_bc, west_bc, arch, barrier, grid, location, args...) - east_event = fill_east_halo!(c, east_bc, child_architecture(arch), barrier, grid, args...) - west_event = fill_west_halo!(c, west_bc, child_architecture(arch), barrier, grid, args...) - return east_event, west_event -end - -function send_east_halo(c, grid, c_location, my_rank, rank_to_send_to) - send_buffer = underlying_east_halo(c, grid, c_location) - send_tag = east_send_tag(my_rank, rank_to_send_to) +##### +##### fill_east_and_west_halos! } +##### fill_north_and_south_halos! } for non-communicating boundary conditions (fallback) +##### fill_top_and_bottom_halos! } +##### - @debug "Sending east halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" - status = MPI.Isend(send_buffer, rank_to_send_to, send_tag, MPI.COMM_WORLD) +for (side, opposite_side) in zip([:east, :north, :top], [:west, :south, :bottom]) + fill_both_halos! = Symbol("fill_$(side)_and_$(opposite_side)_halos!") + fill_side_halo! = Symbol("fill_$(side)_halo!") + fill_opposite_side_halo! 
= Symbol("fill_$(opposite_side)_halo!") - return status + @eval begin + function $fill_both_halos!(c, bc_side, bc_opposite_side, arch, barrier, grid, args...) + event_side = $fill_side_halo!(c, bc_side, child_architecture(arch), barrier, grid, args...) + event_opposite_side = $fill_opposite_side_halo!(c, bc_opposite_side, child_architecture(arch), barrier, grid, args...) + return event_side, event_opposite_side + end + end end -function send_west_halo(c, grid, c_location, my_rank, rank_to_send_to) - send_buffer = underlying_west_halo(c, grid, c_location) - send_tag = west_send_tag(my_rank, rank_to_send_to) +##### +##### fill_east_and_west_halos! } +##### fill_north_and_south_halos! } for when both halos are communicative +##### fill_top_and_bottom_halos! } +##### - @debug "Sending west halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" - status = MPI.Isend(send_buffer, rank_to_send_to, send_tag, MPI.COMM_WORLD) +for (side, opposite_side) in zip([:east, :north, :top], [:west, :south, :bottom]) + fill_both_halos! = Symbol("fill_$(side)_and_$(opposite_side)_halos!") + send_side_halo = Symbol("send_$(side)_halo") + send_opposite_side_halo = Symbol("send_$(opposite_side)_halo") + recv_and_fill_side_halo! = Symbol("recv_and_fill_$(side)_halo!") + recv_and_fill_opposite_side_halo! = Symbol("recv_and_fill_$(opposite_side)_halo!") - return status -end + @eval begin + function $fill_both_halos!(c, bc_side::HaloCommunicationBC, bc_opposite_side::HaloCommunicationBC, arch, barrier, grid, c_location, args...) + @assert bc_side.condition.from == bc_opposite_side.condition.from # Extra protection in case of bugs + my_rank = bc_side.condition.from -function recv_and_fill_east_halo!(c, grid, c_location, my_rank, rank_to_recv_from) - recv_buffer = underlying_east_halo(c, grid, c_location) - recv_tag = east_recv_tag(my_rank, rank_to_recv_from) + $send_side_halo(c, grid, c_location, my_rank, bc_side.condition.to) + $send_opposite_side_halo(c, grid, c_location, my_rank, bc_opposite_side.condition.to) - @debug "Receiving east halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" - MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) + $recv_and_fill_side_halo!(c, grid, c_location, my_rank, bc_side.condition.to) + $recv_and_fill_opposite_side_halo!(c, grid, c_location, my_rank, bc_opposite_side.condition.to) - return nothing + return nothing, nothing + end + end end -function recv_and_fill_west_halo!(c, grid, c_location, my_rank, rank_to_recv_from) - recv_buffer = underlying_west_halo(c, grid, c_location) - recv_tag = west_recv_tag(my_rank, rank_to_recv_from) +##### +##### Sending halos +##### - @debug "Receiving west halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" - MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) +for side in sides + side_str = string(side) + send_side_halo = Symbol("send_$(side)_halo") + underlying_side_halo = Symbol("underlying_$(side)_halo") + side_send_tag = Symbol("$(side)_send_tag") - return nothing + @eval begin + function $send_side_halo(c, grid, c_location, my_rank, rank_to_send_to) + send_buffer = $underlying_side_halo(c, grid, c_location) + send_tag = $side_send_tag(my_rank, rank_to_send_to) + + @debug "Sending " * $side_str * " halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" + status = MPI.Isend(send_buffer, rank_to_send_to, send_tag, MPI.COMM_WORLD) + + return status + end + end end -function fill_east_and_west_halos!(c, 
east_bc::HaloCommunicationBC, west_bc::HaloCommunicationBC, arch, barrier, grid, c_location, args...) - my_rank = east_bc.condition.from - send_east_halo(c, grid, c_location, my_rank, east_bc.condition.to) - send_west_halo(c, grid, c_location, my_rank, west_bc.condition.to) +##### +##### Receiving and filling halos (buffer is a view so should get filled upon receive) +##### + +for side in sides + side_str = string(side) + recv_and_fill_side_halo! = Symbol("recv_and_fill_$(side)_halo!") + underlying_side_halo = Symbol("underlying_$(side)_halo") + side_recv_tag = Symbol("$(side)_recv_tag") + + @eval begin + function $recv_and_fill_side_halo!(c, grid, c_location, my_rank, rank_to_recv_from) + recv_buffer = $underlying_side_halo(c, grid, c_location) + recv_tag = $side_recv_tag(my_rank, rank_to_recv_from) - recv_and_fill_east_halo!(c, grid, c_location, my_rank, east_bc.condition.to) - recv_and_fill_west_halo!(c, grid, c_location, my_rank, west_bc.condition.to) + @debug "Receiving " * $side_str * " halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" + MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) - return nothing, nothing + return nothing + end + end end diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_model.jl index 77e4d22ae1..0121b32d61 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_model.jl @@ -236,7 +236,7 @@ end function run_triply_periodic_halo_communication_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 1), extent=(1, 2, 3)) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 6, 4), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) dm = DistributedModel(architecture=arch, grid=full_grid) @@ -251,6 +251,40 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks() return nothing end +function run_triply_periodic_halo_communication_tests_with_141_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(3, 8, 2), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + for field in fields(dm.model) + set!(field, arch.my_rank) + fill_halo_regions!(field, arch) + + @test all(north_halo(field) .== arch.connectivity.north) + @test all(south_halo(field) .== arch.connectivity.south) + end + + return nothing +end + +function run_triply_periodic_halo_communication_tests_with_114_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(3, 5, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + for field in fields(dm.model) + set!(field, arch.my_rank) + fill_halo_regions!(field, arch) + + @test all(top_halo(field) .== arch.connectivity.top) + @test all(bottom_halo(field) .== arch.connectivity.bottom) + end + + return nothing +end + ##### ##### Run tests! ##### @@ -283,6 +317,8 @@ end @testset "Halo communication" begin @info " Testing halo communication..." 
run_triply_periodic_halo_communication_tests_with_411_ranks() + run_triply_periodic_halo_communication_tests_with_141_ranks() + run_triply_periodic_halo_communication_tests_with_114_ranks() end # TODO: 221 ranks From d18f6700e882344d72d9b29b42a364049ce446bb Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Fri, 5 Feb 2021 23:59:56 -0500 Subject: [PATCH 056/100] Testing xy decompositions --- ...buted_model.jl => test_distributed_mpi.jl} | 241 +++++++++++++----- 1 file changed, 178 insertions(+), 63 deletions(-) rename src/Distributed/{test_distributed_model.jl => test_distributed_mpi.jl} (56%) diff --git a/src/Distributed/test_distributed_model.jl b/src/Distributed/test_distributed_mpi.jl similarity index 56% rename from src/Distributed/test_distributed_model.jl rename to src/Distributed/test_distributed_mpi.jl index 0121b32d61..c76b45b482 100644 --- a/src/Distributed/test_distributed_model.jl +++ b/src/Distributed/test_distributed_mpi.jl @@ -31,6 +31,10 @@ function run_triply_periodic_rank_connectivity_tests_with_411_ranks() @test isnothing(connectivity.top) @test isnothing(connectivity.bottom) + # +---+---+---+---+ + # | 0 | 1 | 2 | 3 | + # +---+---+---+---+ + if my_rank == 0 @test connectivity.east == 1 @test connectivity.west == 3 @@ -64,6 +68,16 @@ function run_triply_periodic_rank_connectivity_tests_with_141_ranks() @test isnothing(connectivity.top) @test isnothing(connectivity.bottom) + # +---+ + # | 0 | + # +---+ + # | 1 | + # +---+ + # | 2 | + # +---+ + # | 3 | + # +---+ + if my_rank == 0 @test connectivity.north == 1 @test connectivity.south == 3 @@ -97,6 +111,19 @@ function run_triply_periodic_rank_connectivity_tests_with_114_ranks() @test isnothing(connectivity.north) @test isnothing(connectivity.south) + # /---/ + # / 3 / + # /---/ + # /---/ + # / 2 / + # /---/ + # /---/ + # / 1 / + # /---/ + # /---/ + # / 0 / + # /---/ + if my_rank == 0 @test connectivity.top == 1 @test connectivity.bottom == 3 @@ -114,6 +141,51 @@ function run_triply_periodic_rank_connectivity_tests_with_114_ranks() return nothing end +function run_triply_periodic_rank_connectivity_tests_with_221_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) + + my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test my_rank == index2rank(arch.my_index..., arch.ranks...) + + connectivity = arch.connectivity + + # No communication in z. 
+ @test isnothing(connectivity.top) + @test isnothing(connectivity.bottom) + + # +---+---+ + # | 0 | 2 | + # +---+---+ + # | 1 | 3 | + # +---+---+ + + if my_rank == 0 + @test connectivity.east == 2 + @test connectivity.west == 2 + @test connectivity.north == 1 + @test connectivity.south == 1 + elseif my_rank == 1 + @test connectivity.east == 3 + @test connectivity.west == 3 + @test connectivity.north == 0 + @test connectivity.south == 0 + elseif my_rank == 2 + @test connectivity.east == 0 + @test connectivity.west == 0 + @test connectivity.north == 3 + @test connectivity.south == 3 + elseif my_rank == 3 + @test connectivity.east == 1 + @test connectivity.west == 1 + @test connectivity.north == 2 + @test connectivity.south == 2 + end + + return nothing +end + ##### ##### Local grids for distributed models ##### @@ -125,15 +197,15 @@ function run_triply_periodic_local_grid_tests_with_411_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + local_grid = dm.model.grid + nx, ny, nz = size(local_grid) - model = dm.model - nx, ny, nz = size(model.grid) - @test model.grid.xF[1] == 0.25*my_rank - @test model.grid.xF[nx+1] == 0.25*(my_rank+1) - @test model.grid.yF[1] == 0 - @test model.grid.yF[ny+1] == 2 - @test model.grid.zF[1] == -3 - @test model.grid.zF[nz+1] == 0 + @test local_grid.xF[1] == 0.25*my_rank + @test local_grid.xF[nx+1] == 0.25*(my_rank+1) + @test local_grid.yF[1] == 0 + @test local_grid.yF[ny+1] == 2 + @test local_grid.zF[1] == -3 + @test local_grid.zF[nz+1] == 0 return nothing end @@ -145,15 +217,15 @@ function run_triply_periodic_local_grid_tests_with_141_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + local_grid = dm.model.grid + nx, ny, nz = size(local_grid) - model = dm.model - nx, ny, nz = size(model.grid) - @test model.grid.xF[1] == 0 - @test model.grid.xF[nx+1] == 1 - @test model.grid.yF[1] == 0.5*my_rank - @test model.grid.yF[ny+1] == 0.5*(my_rank+1) - @test model.grid.zF[1] == -3 - @test model.grid.zF[nz+1] == 0 + @test local_grid.xF[1] == 0 + @test local_grid.xF[nx+1] == 1 + @test local_grid.yF[1] == 0.5*my_rank + @test local_grid.yF[ny+1] == 0.5*(my_rank+1) + @test local_grid.zF[1] == -3 + @test local_grid.zF[nz+1] == 0 return nothing end @@ -165,21 +237,41 @@ function run_triply_periodic_local_grid_tests_with_114_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + local_grid = dm.model.grid + nx, ny, nz = size(local_grid) - model = dm.model - nx, ny, nz = size(model.grid) - @test model.grid.xF[1] == 0 - @test model.grid.xF[nx+1] == 1 - @test model.grid.yF[1] == 0 - @test model.grid.yF[ny+1] == 2 - @test model.grid.zF[1] == -3 + 0.75*my_rank - @test model.grid.zF[nz+1] == -3 + 0.75*(my_rank+1) + @test local_grid.xF[1] == 0 + @test local_grid.xF[nx+1] == 1 + @test local_grid.yF[1] == 0 + @test local_grid.yF[ny+1] == 2 + @test local_grid.zF[1] == -3 + 0.75*my_rank + @test local_grid.zF[nz+1] == -3 + 0.75*(my_rank+1) + + return nothing +end + +function run_triply_periodic_local_grid_tests_with_221_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + i, j, k = arch.my_index + local_grid = dm.model.grid + nx, ny, nz = size(local_grid) + + @test local_grid.xF[1] == 0.5*(i-1) + @test local_grid.xF[nx+1] == 0.5*i + @test 
local_grid.yF[1] == j-1 + @test local_grid.yF[ny+1] == j + @test local_grid.zF[1] == -3 + @test local_grid.zF[nz+1] == 0 return nothing end ##### -##### +##### Injection of halo communication BCs ##### function run_triply_periodic_bc_injection_tests_with_411_ranks() @@ -189,12 +281,13 @@ function run_triply_periodic_bc_injection_tests_with_411_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) for field in fields(dm.model) - @test field.boundary_conditions.east isa HaloCommunicationBC - @test field.boundary_conditions.west isa HaloCommunicationBC - @test !isa(field.boundary_conditions.north, HaloCommunicationBC) - @test !isa(field.boundary_conditions.south, HaloCommunicationBC) - @test !isa(field.boundary_conditions.top, HaloCommunicationBC) - @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) + fbcs = field.boundary_conditions + @test fbcs.east isa HaloCommunicationBC + @test fbcs.west isa HaloCommunicationBC + @test !isa(fbcs.north, HaloCommunicationBC) + @test !isa(fbcs.south, HaloCommunicationBC) + @test !isa(fbcs.top, HaloCommunicationBC) + @test !isa(fbcs.bottom, HaloCommunicationBC) end end @@ -205,12 +298,13 @@ function run_triply_periodic_bc_injection_tests_with_141_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) for field in fields(dm.model) - @test !isa(field.boundary_conditions.east, HaloCommunicationBC) - @test !isa(field.boundary_conditions.west, HaloCommunicationBC) - @test field.boundary_conditions.north isa HaloCommunicationBC - @test field.boundary_conditions.south isa HaloCommunicationBC - @test !isa(field.boundary_conditions.top, HaloCommunicationBC) - @test !isa(field.boundary_conditions.bottom, HaloCommunicationBC) + fbcs = field.boundary_conditions + @test !isa(fbcs.east, HaloCommunicationBC) + @test !isa(fbcs.west, HaloCommunicationBC) + @test fbcs.north isa HaloCommunicationBC + @test fbcs.south isa HaloCommunicationBC + @test !isa(fbcs.top, HaloCommunicationBC) + @test !isa(fbcs.bottom, HaloCommunicationBC) end end @@ -221,12 +315,30 @@ function run_triply_periodic_bc_injection_tests_with_114_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) for field in fields(dm.model) - @test !isa(field.boundary_conditions.east, HaloCommunicationBC) - @test !isa(field.boundary_conditions.west, HaloCommunicationBC) - @test !isa(field.boundary_conditions.north, HaloCommunicationBC) - @test !isa(field.boundary_conditions.south, HaloCommunicationBC) - @test field.boundary_conditions.top isa HaloCommunicationBC - @test field.boundary_conditions.bottom isa HaloCommunicationBC + fbcs = field.boundary_conditions + @test !isa(fbcs.east, HaloCommunicationBC) + @test !isa(fbcs.west, HaloCommunicationBC) + @test !isa(fbcs.north, HaloCommunicationBC) + @test !isa(fbcs.south, HaloCommunicationBC) + @test fbcs.top isa HaloCommunicationBC + @test fbcs.bottom isa HaloCommunicationBC + end +end + +function run_triply_periodic_bc_injection_tests_with_221_ranks() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + for field in fields(dm.model) + fbcs = field.boundary_conditions + @test fbcs.east isa HaloCommunicationBC + @test fbcs.west isa HaloCommunicationBC + @test fbcs.north isa HaloCommunicationBC + @test fbcs.south isa HaloCommunicationBC + @test !isa(fbcs.top, HaloCommunicationBC) + @test !isa(fbcs.bottom, HaloCommunicationBC) end end @@ -292,26 
+404,29 @@ end @testset "Distributed MPI Oceananigans" begin @info "Testing distributed MPI Oceananigans..." - @testset "Multi architectures rank connectivity" begin - @info " Testing multi architecture rank connectivity..." - run_triply_periodic_rank_connectivity_tests_with_411_ranks() - run_triply_periodic_rank_connectivity_tests_with_141_ranks() - run_triply_periodic_rank_connectivity_tests_with_114_ranks() - end - - @testset "Local grids for distributed models" begin - @info " Testing local grids for distributed models..." - run_triply_periodic_local_grid_tests_with_411_ranks() - run_triply_periodic_local_grid_tests_with_141_ranks() - run_triply_periodic_local_grid_tests_with_114_ranks() - end - - @testset "Injection of halo communication BCs" begin - @info " Testing injection of halo communication BCs..." - run_triply_periodic_bc_injection_tests_with_411_ranks() - run_triply_periodic_bc_injection_tests_with_141_ranks() - run_triply_periodic_bc_injection_tests_with_114_ranks() - end + # @testset "Multi architectures rank connectivity" begin + # @info " Testing multi architecture rank connectivity..." + # run_triply_periodic_rank_connectivity_tests_with_411_ranks() + # run_triply_periodic_rank_connectivity_tests_with_141_ranks() + # run_triply_periodic_rank_connectivity_tests_with_114_ranks() + # run_triply_periodic_rank_connectivity_tests_with_221_ranks() + # end + + # @testset "Local grids for distributed models" begin + # @info " Testing local grids for distributed models..." + # run_triply_periodic_local_grid_tests_with_411_ranks() + # run_triply_periodic_local_grid_tests_with_141_ranks() + # run_triply_periodic_local_grid_tests_with_114_ranks() + # run_triply_periodic_local_grid_tests_with_221_ranks() + # end + + # @testset "Injection of halo communication BCs" begin + # @info " Testing injection of halo communication BCs..." + # run_triply_periodic_bc_injection_tests_with_411_ranks() + # run_triply_periodic_bc_injection_tests_with_141_ranks() + # run_triply_periodic_bc_injection_tests_with_114_ranks() + # run_triply_periodic_bc_injection_tests_with_221_ranks() + # end # TODO: Larger halos! @testset "Halo communication" begin From fff4de7bcad283306b5e42baf4aa94d4e0b60d43 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 00:00:26 -0500 Subject: [PATCH 057/100] Add `include_corners` kwarg for halo functions --- src/Distributed/distributed_utils.jl | 47 ++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl index 117ae74f3c..51dcd77301 100644 --- a/src/Distributed/distributed_utils.jl +++ b/src/Distributed/distributed_utils.jl @@ -1,17 +1,46 @@ -using Oceananigans.Grids: left_halo_indices, right_halo_indices, underlying_left_halo_indices, underlying_right_halo_indices using Oceananigans.Fields: AbstractField +using Oceananigans.Grids: + interior_indices, + left_halo_indices, right_halo_indices, + underlying_left_halo_indices, underlying_right_halo_indices -west_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = - view(f.data, left_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) +# TODO: Move to Grids/grid_utils.jl -east_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = - view(f.data, right_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) +west_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} = + include_corners ? 
view(f.data, left_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) : + view(f.data, left_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), + interior_indices(LY, topology(f, 2), f.grid.Ny), + interior_indices(LZ, topology(f, 3), f.grid.Nz)) -south_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = - view(f.data, :, left_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) +east_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} = + include_corners ? view(f.data, right_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) : + view(f.data, right_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), + interior_indices(LY, topology(f, 2), f.grid.Ny), + interior_indices(LZ, topology(f, 3), f.grid.Nz)) -north_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = - view(f.data, :, right_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) +south_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} = + include_corners ? view(f.data, :, left_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) : + view(f.data, interior_indices(LX, topology(f, 1), f.grid.Nx), + left_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), + interior_indices(LZ, topology(f, 3), f.grid.Nz)) + +north_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} = + include_corners ? view(f.data, :, right_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), :) : + view(f.data, interior_indices(LX, topology(f, 1), f.grid.Nx), + right_halo_indices(LY, topology(f, 2), f.grid.Ny, f.grid.Hy), + interior_indices(LZ, topology(f, 3), f.grid.Nz)) + +bottom_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} = + include_corners ? view(f.data, :, :, left_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz)) : + view(f.data, interior_indices(LX, topology(f, 1), f.grid.Nx), + interior_indices(LY, topology(f, 2), f.grid.Ny), + left_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz)) + +top_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} = + include_corners ? 
view(f.data, :, :, right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz)) :
+                      view(f.data, interior_indices(LX, topology(f, 1), f.grid.Nx),
+                           interior_indices(LY, topology(f, 2), f.grid.Ny),
+                           right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz))
 
 bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} =
     view(f.data, :, :, left_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :)
 
 top_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} =
     view(f.data, :, :, right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :)

From 0dec95812ab0da75873a517f0d17517bb42eb4c6 Mon Sep 17 00:00:00 2001
From: ali-ramadhan
Date: Sat, 6 Feb 2021 00:07:20 -0500
Subject: [PATCH 058/100] Test that halo communication doesn't leak

---
 src/Distributed/test_distributed_mpi.jl | 91 ++++++++++++++++++-------
 1 file changed, 65 insertions(+), 26 deletions(-)

diff --git a/src/Distributed/test_distributed_mpi.jl b/src/Distributed/test_distributed_mpi.jl
index c76b45b482..f52a402ab0 100644
--- a/src/Distributed/test_distributed_mpi.jl
+++ b/src/Distributed/test_distributed_mpi.jl
@@ -358,6 +358,12 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks()
 
         @test all(east_halo(field) .== arch.connectivity.east)
         @test all(west_halo(field) .== arch.connectivity.west)
+
+        @test all(interior(field) .== arch.my_rank)
+        @test all(north_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(south_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(top_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(bottom_halo(field, include_corners=false) .== arch.my_rank)
     end
 
     return nothing
@@ -375,6 +381,12 @@ function run_triply_periodic_halo_communication_tests_with_141_ranks()
 
         @test all(north_halo(field) .== arch.connectivity.north)
         @test all(south_halo(field) .== arch.connectivity.south)
+
+        @test all(interior(field) .== arch.my_rank)
+        @test all(east_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(west_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(top_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(bottom_halo(field, include_corners=false) .== arch.my_rank)
     end
 
     return nothing
@@ -392,6 +404,35 @@ function run_triply_periodic_halo_communication_tests_with_114_ranks()
 
         @test all(top_halo(field) .== arch.connectivity.top)
         @test all(bottom_halo(field) .== arch.connectivity.bottom)
+
+        @test all(interior(field) .== arch.my_rank)
+        @test all(east_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(west_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(north_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(south_halo(field, include_corners=false) .== arch.my_rank)
+    end
+
+    return nothing
+end
+
+function run_triply_periodic_halo_communication_tests_with_221_ranks()
+    topo = (Periodic, Periodic, Periodic)
+    full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3))
+    arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1))
+    dm = DistributedModel(architecture=arch, grid=full_grid)
+
+    for field in fields(dm.model)
+        set!(field, arch.my_rank)
+        fill_halo_regions!(field, arch)
+
+        @test all(east_halo(field) .== arch.connectivity.east)
+        @test all(west_halo(field) .== arch.connectivity.west)
+        @test all(north_halo(field) .== arch.connectivity.north)
+        @test all(south_halo(field) .== arch.connectivity.south)
+
+        @test all(interior(field) .== arch.my_rank)
+        @test all(top_halo(field, include_corners=false) .== arch.my_rank)
+        @test all(bottom_halo(field, include_corners=false) .== arch.my_rank)
     end
 
     return nothing
@@ -404,29 +445,29 @@ end
 
 @testset "Distributed MPI Oceananigans" begin
     @info "Testing 
distributed MPI Oceananigans..." - # @testset "Multi architectures rank connectivity" begin - # @info " Testing multi architecture rank connectivity..." - # run_triply_periodic_rank_connectivity_tests_with_411_ranks() - # run_triply_periodic_rank_connectivity_tests_with_141_ranks() - # run_triply_periodic_rank_connectivity_tests_with_114_ranks() - # run_triply_periodic_rank_connectivity_tests_with_221_ranks() - # end - - # @testset "Local grids for distributed models" begin - # @info " Testing local grids for distributed models..." - # run_triply_periodic_local_grid_tests_with_411_ranks() - # run_triply_periodic_local_grid_tests_with_141_ranks() - # run_triply_periodic_local_grid_tests_with_114_ranks() - # run_triply_periodic_local_grid_tests_with_221_ranks() - # end - - # @testset "Injection of halo communication BCs" begin - # @info " Testing injection of halo communication BCs..." - # run_triply_periodic_bc_injection_tests_with_411_ranks() - # run_triply_periodic_bc_injection_tests_with_141_ranks() - # run_triply_periodic_bc_injection_tests_with_114_ranks() - # run_triply_periodic_bc_injection_tests_with_221_ranks() - # end + @testset "Multi architectures rank connectivity" begin + @info " Testing multi architecture rank connectivity..." + run_triply_periodic_rank_connectivity_tests_with_411_ranks() + run_triply_periodic_rank_connectivity_tests_with_141_ranks() + run_triply_periodic_rank_connectivity_tests_with_114_ranks() + run_triply_periodic_rank_connectivity_tests_with_221_ranks() + end + + @testset "Local grids for distributed models" begin + @info " Testing local grids for distributed models..." + run_triply_periodic_local_grid_tests_with_411_ranks() + run_triply_periodic_local_grid_tests_with_141_ranks() + run_triply_periodic_local_grid_tests_with_114_ranks() + run_triply_periodic_local_grid_tests_with_221_ranks() + end + + @testset "Injection of halo communication BCs" begin + @info " Testing injection of halo communication BCs..." + run_triply_periodic_bc_injection_tests_with_411_ranks() + run_triply_periodic_bc_injection_tests_with_141_ranks() + run_triply_periodic_bc_injection_tests_with_114_ranks() + run_triply_periodic_bc_injection_tests_with_221_ranks() + end # TODO: Larger halos! 
@testset "Halo communication" begin @@ -434,10 +475,8 @@ end run_triply_periodic_halo_communication_tests_with_411_ranks() run_triply_periodic_halo_communication_tests_with_141_ranks() run_triply_periodic_halo_communication_tests_with_114_ranks() + # run_triply_periodic_halo_communication_tests_with_221_ranks() end - - # TODO: 221 ranks - # TODO: triply bounded end # MPI.Finalize() From 7dbe2bb137b5ec07791ac4496e2ff44f04f51beb Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 09:17:59 -0500 Subject: [PATCH 059/100] Distributed FFT based Poisson solver --- .../distributed_fft_based_poisson_solver.jl | 66 +++++++++++++++++++ src/Distributed/distributed_utils.jl | 2 +- src/Distributed/halo_communication.jl | 2 +- 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 src/Distributed/distributed_fft_based_poisson_solver.jl diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl new file mode 100644 index 0000000000..df384b3fbd --- /dev/null +++ b/src/Distributed/distributed_fft_based_poisson_solver.jl @@ -0,0 +1,66 @@ +using PencilFFTs + +struct DistributedFFTBasedPoissonSolver{P, F, L, λ, S} + plan :: P + full_grid :: F + my_grid :: L + my_eigenvalues :: λ + storage :: S +end + +reshaped_size(N, dim) = dim == 1 ? (N, 1, 1) : + dim == 2 ? (1, N, 1) : + dim == 3 ? (1, 1, N) : nothing + +function poisson_eigenvalues(N, L, dim, ::Periodic) + inds = reshape(1:N, reshaped_size(N, dim)...) + return @. (2sin((inds - 1) * π / N) / (L / N))^2 +end + +function DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) + topo = (TX, TY, TZ) = topology(full_grid) + + λx = poisson_eigenvalues(full_grid.Nx, full_grid.Lx, 1, TX()) + λy = poisson_eigenvalues(full_grid.Ny, full_grid.Ly, 2, TY()) + λz = poisson_eigenvalues(full_grid.Nz, full_grid.Lz, 3, TZ()) + + I, J, K = arch.my_index + my_eigenvalues = ( + λx = λx[(I-1)*local_grid.Nx+1:I*local_grid.Nx, :, :], + λy = λy[:, (J-1)*local_grid.Ny+1:J*local_grid.Ny, :], + λz = λz[:, :, (K-1)*local_grid.Nz+1:K*local_grid.Nz] + ) + + transform = PencilFFTs.Transforms.FFT!() + proc_dims = (arch.ranks[2], arch.ranks[3]) + plan = PencilFFTPlan(size(full_grid), transform, proc_dims, MPI.COMM_WORLD) + + storage = allocate_input(plan) + + return DistributedFFTBasedPoissonSolver(plan, full_grid, local_grid, my_eigenvalues, storage) +end + +function solve_poisson_equation!(solver::DistributedFFTBasedPoissonSolver) + λx, λy, λz = solver.my_eigenvalues + + # https://jipolanco.github.io/PencilFFTs.jl/dev/PencilFFTs/#PencilFFTs.allocate_input + RHS = ϕ = first(solver.storage) + + # Apply forward transforms. + solver.plan * solver.storage + + # Solve the discrete Poisson equation. + @. ϕ = -RHS / (λx + λy + λz) + + # Setting DC component of the solution (the mean) to be zero. This is also + # necessary because the source term to the Poisson equation has zero mean + # and so the DC component comes out to be ∞. + if MPI.Comm_rank(MPI.COMM_WORLD) == 0 + ϕ[1, 1, 1] = 0 + end + + # Apply backward transforms. 
+ solver.plan \ solver.storage + + return nothing +end diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl index 51dcd77301..1fef722396 100644 --- a/src/Distributed/distributed_utils.jl +++ b/src/Distributed/distributed_utils.jl @@ -58,7 +58,7 @@ underlying_south_halo(f, grid, location) = view(f.parent, :, underlying_left_halo_indices(location, topology(grid, 2), grid.Ny, grid.Hy), :) underlying_north_halo(f, grid, location) = - view(f.parent, :, underlying_right_halo_indices(location, topology(grid, 2), grid.Nz, grid.Hz), :) + view(f.parent, :, underlying_right_halo_indices(location, topology(grid, 2), grid.Ny, grid.Hy), :) underlying_bottom_halo(f, grid, location) = view(f.parent, :, :, underlying_left_halo_indices(location, topology(grid, 3), grid.Nz, grid.Hz)) diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index 2dc48f4857..bc82f48fd3 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -149,7 +149,7 @@ for side in sides end ##### -##### Receiving and filling halos (buffer is a view so should get filled upon receive) +##### Receiving and filling halos (buffer is a view so it gets filled upon receive) ##### for side in sides From da21e2f522d994054b4029b150ed84e47cdebb37 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 11:21:16 -0500 Subject: [PATCH 060/100] Janky Poisson solve works out --- .../distributed_fft_based_poisson_solver.jl | 37 ++++-- .../test_distributed_poisson_solvers.jl | 116 ++++++++++++++++++ 2 files changed, 141 insertions(+), 12 deletions(-) create mode 100644 src/Distributed/test_distributed_poisson_solvers.jl diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl index df384b3fbd..480e163788 100644 --- a/src/Distributed/distributed_fft_based_poisson_solver.jl +++ b/src/Distributed/distributed_fft_based_poisson_solver.jl @@ -24,14 +24,17 @@ function DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) λy = poisson_eigenvalues(full_grid.Ny, full_grid.Ly, 2, TY()) λz = poisson_eigenvalues(full_grid.Nz, full_grid.Lz, 3, TZ()) - I, J, K = arch.my_index - my_eigenvalues = ( - λx = λx[(I-1)*local_grid.Nx+1:I*local_grid.Nx, :, :], - λy = λy[:, (J-1)*local_grid.Ny+1:J*local_grid.Ny, :], - λz = λz[:, :, (K-1)*local_grid.Nz+1:K*local_grid.Nz] - ) - - transform = PencilFFTs.Transforms.FFT!() + my_eigenvalues = (; λx, λy, λz) + + # I, J, K = arch.my_index + # my_eigenvalues = ( + # λx = λx[(I-1)*local_grid.Nx+1:I*local_grid.Nx, :, :], + # λy = λy[:, (J-1)*local_grid.Ny+1:J*local_grid.Ny, :], + # λz = λz[:, :, (K-1)*local_grid.Nz+1:K*local_grid.Nz] + # ) + + # transform = PencilFFTs.Transforms.FFT!() + transform = PencilFFTs.Transforms.FFT() proc_dims = (arch.ranks[2], arch.ranks[3]) plan = PencilFFTPlan(size(full_grid), transform, proc_dims, MPI.COMM_WORLD) @@ -44,13 +47,22 @@ function solve_poisson_equation!(solver::DistributedFFTBasedPoissonSolver) λx, λy, λz = solver.my_eigenvalues # https://jipolanco.github.io/PencilFFTs.jl/dev/PencilFFTs/#PencilFFTs.allocate_input - RHS = ϕ = first(solver.storage) + # RHS = ϕ = first(solver.storage) + RHS = ϕ = solver.storage # Apply forward transforms. - solver.plan * solver.storage + # ϕ = solver.plan * solver.storage + ϕ = solver.plan * RHS + + @show size(RHS) + @show size(ϕ) + + λx = reshape(λx, 1, solver.my_grid.Nx, 1) + λy = reshape(λy, solver.my_grid.Ny, 1, 1) # Solve the discrete Poisson equation. - @. 
ϕ = -RHS / (λx + λy + λz) + # @. ϕ = -RHS / (λx + λy + λz) + @. ϕ = -ϕ / (λx + λy + λz) # Setting DC component of the solution (the mean) to be zero. This is also # necessary because the source term to the Poisson equation has zero mean @@ -60,7 +72,8 @@ function solve_poisson_equation!(solver::DistributedFFTBasedPoissonSolver) end # Apply backward transforms. - solver.plan \ solver.storage + # solver.plan \ solver.storage + solver.storage .= solver.plan \ ϕ return nothing end diff --git a/src/Distributed/test_distributed_poisson_solvers.jl b/src/Distributed/test_distributed_poisson_solvers.jl new file mode 100644 index 0000000000..993004b807 --- /dev/null +++ b/src/Distributed/test_distributed_poisson_solvers.jl @@ -0,0 +1,116 @@ +using Test +using Oceananigans +using Oceananigans.Architectures +using Oceananigans.Solvers +using Oceananigans.Utils +using Oceananigans.Operators +using Oceananigans.BoundaryConditions: fill_halo_regions! +using KernelAbstractions: @kernel, @index, Event + +@kernel function ∇²!(grid, f, ∇²f) + i, j, k = @index(Global, NTuple) + @inbounds ∇²f[i, j, k] = ∇²(i, j, k, grid, f) +end + +@kernel function divergence!(grid, u, v, w, div) + i, j, k = @index(Global, NTuple) + @inbounds div[i, j, k] = divᶜᶜᶜ(i, j, k, grid, u, v, w) +end + +function random_divergent_source_term(FT, arch, grid) + # Generate right hand side from a random (divergent) velocity field. + Ru = CenterField(FT, arch, grid, UVelocityBoundaryConditions(grid)) + Rv = CenterField(FT, arch, grid, VVelocityBoundaryConditions(grid)) + Rw = CenterField(FT, arch, grid, WVelocityBoundaryConditions(grid)) + U = (u=Ru, v=Rv, w=Rw) + + Nx, Ny, Nz = size(grid) + set!(Ru, rand(Nx, Ny, Nz)) + set!(Rv, rand(Nx, Ny, Nz)) + set!(Rw, rand(Nx, Ny, Nz)) + + # Adding (nothing, nothing) in case we need to dispatch on ::NFBC + fill_halo_regions!(Ru, arch, nothing, nothing) + fill_halo_regions!(Rv, arch, nothing, nothing) + fill_halo_regions!(Rw, arch, nothing, nothing) + + # Compute the right hand side R = ∇⋅U + ArrayType = array_type(arch) + R = zeros(Nx, Ny, Nz) |> ArrayType + event = launch!(arch, grid, :xyz, divergence!, grid, U.u.data, U.v.data, U.w.data, R, + dependencies=Event(device(arch))) + wait(device(arch), event) + + return R +end + +function compute_∇²!(∇²ϕ, ϕ, arch, grid) + fill_halo_regions!(ϕ, arch) + child_arch = child_architecture(arch) + event = launch!(child_arch, grid, :xyz, ∇²!, grid, ϕ.data, ∇²ϕ.data, dependencies=Event(device(child_arch))) + wait(device(child_arch), event) + fill_halo_regions!(∇²ϕ, arch) + return nothing +end + +function divergence_free_poisson_solution_triply_periodic() + topo = (Periodic, Periodic, Periodic) + full_grid = RegularCartesianGrid(topology=topo, size=(16, 16, 1), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid) + + local_grid = dm.model.grid + solver = DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) + + R = random_divergent_source_term(Float64, child_architecture(arch), local_grid) + # first(solver.storage) .= R + solver.storage .= R + + solve_poisson_equation!(solver) + + p_bcs = PressureBoundaryConditions(local_grid) + p_bcs = inject_halo_communication_boundary_conditions(p_bcs, arch.my_rank, arch.connectivity) + + ϕ = CenterField(Float64, child_architecture(arch), local_grid, p_bcs) # "pressure" + ∇²ϕ = CenterField(Float64, child_architecture(arch), local_grid, p_bcs) + + interior(ϕ) .= real(solver.storage) + compute_∇²!(∇²ϕ, ϕ, arch, local_grid) + + return nothing +end + 
+topo = (Periodic, Periodic, Periodic) +full_grid = RegularCartesianGrid(topology=topo, size=(16, 16, 1), extent=(1, 2, 3)) +arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) +dm = DistributedModel(architecture=arch, grid=full_grid) +local_grid = dm.model.grid +solver = DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) +Random.seed!(0) +R = rand(size(full_grid)...) +I, J, K = arch.my_index +R = R[:, local_grid.Ny*(J-1)+1:local_grid.Ny*J, :] +solver.storage .= R +F = solver.plan * solver.storage +λx, λy, λz = solver.my_eigenvalues +λx = λx[(J-1)*local_grid.Ny+1:J*local_grid.Ny, :, :] +@. F = -F / (λx + λy + λz) +if MPI.Comm_rank(MPI.COMM_WORLD) == 0 + F[1, 1, 1] = 0 +end +B = real(solver.plan \ F) + +Nx, Ny, Nz = 16, 16, 1 +Lx, Ly, Lz = 1, 2, 3 +Random.seed!(0) +R = rand(16, 16, 1) +F = fft(R) +λx = @. (2sin((0:Nx - 1) * π / Nx) / (Lx / Nx))^2 +λy = @. (2sin((0:Ny - 1) * π / Ny) / (Ly / Ny))^2 +λz = @. (2sin((0:Nz - 1) * π / Nz) / (Lz / Nz))^2 +λx = reshape(λx, Nx, 1, 1) +λy = reshape(λy, 1, Ny, 1) +λz = reshape(λz, 1, 1, Nz) +@. F = -F / (λx + λy + λz) +F[1, 1, 1] = 0 +B = real(ifft(F)) From 20d5f076bcafd07f87597f39b6d9dbcfc16eb033 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 12:30:59 -0500 Subject: [PATCH 061/100] Gotta send boundary points, not halos! --- src/Distributed/distributed_utils.jl | 44 +++++++++++++++++++++---- src/Distributed/halo_communication.jl | 4 +-- src/Distributed/test_distributed_mpi.jl | 10 +++--- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl index 1fef722396..8ce5fe3827 100644 --- a/src/Distributed/distributed_utils.jl +++ b/src/Distributed/distributed_utils.jl @@ -6,6 +6,10 @@ using Oceananigans.Grids: # TODO: Move to Grids/grid_utils.jl +##### +##### Viewing halos +##### + west_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} = include_corners ? 
view(f.data, left_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx), :, :) :
                       view(f.data, left_halo_indices(LX, topology(f, 1), f.grid.Nx, f.grid.Hx),
                            interior_indices(LY, topology(f, 2), f.grid.Ny),
                            interior_indices(LZ, topology(f, 3), f.grid.Nz))
 
@@ -42,12 +46,6 @@ top_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ}
                            interior_indices(LY, topology(f, 2), f.grid.Ny),
                            right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz))
 
-bottom_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} =
-    view(f.data, :, :, left_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :)
-
-top_halo(f::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} =
-    view(f.data, :, :, right_halo_indices(LZ, topology(f, 3), f.grid.Nz, f.grid.Hz), :)
-
 underlying_west_halo(f, grid, location) =
     view(f.parent, underlying_left_halo_indices(location, topology(grid, 1), grid.Nx, grid.Hx), :, :)
 
@@ -65,3 +63,37 @@ underlying_bottom_halo(f, grid, location) =
 
 underlying_top_halo(f, grid, location) =
     view(f.parent, :, :, underlying_right_halo_indices(location, topology(grid, 3), grid.Nz, grid.Hz))
+
+#####
+##### Viewing boundary grid points (used to fill other halos)
+#####
+
+left_boundary_indices(loc, topo, N, H) = 1:H
+left_boundary_indices(::Type{Nothing}, topo, N, H) = 1:0 # empty
+
+right_boundary_indices(loc, topo, N, H) = N-H+1:N
+right_boundary_indices(::Type{Nothing}, topo, N, H) = 1:0 # empty
+
+underlying_left_boundary_indices(loc, topo, N, H) = 1+H:2H
+underlying_left_boundary_indices(::Type{Nothing}, topo, N, H) = 1:0 # empty
+
+underlying_right_boundary_indices(loc, topo, N, H) = N+1:N+H
+underlying_right_boundary_indices(::Type{Nothing}, topo, N, H) = 1:0 # empty
+
+underlying_west_boundary(f, grid, location) =
+    view(f.parent, underlying_left_boundary_indices(location, topology(grid, 1), grid.Nx, grid.Hx), :, :)
+
+underlying_east_boundary(f, grid, location) =
+    view(f.parent, underlying_right_boundary_indices(location, topology(grid, 1), grid.Nx, grid.Hx), :, :)
+
+underlying_south_boundary(f, grid, location) =
+    view(f.parent, :, underlying_left_boundary_indices(location, topology(grid, 2), grid.Ny, grid.Hy), :)
+
+underlying_north_boundary(f, grid, location) =
+    view(f.parent, :, underlying_right_boundary_indices(location, topology(grid, 2), grid.Ny, grid.Hy), :)
+
+underlying_bottom_boundary(f, grid, location) =
+    view(f.parent, :, :, underlying_left_boundary_indices(location, topology(grid, 3), grid.Nz, grid.Hz))
+
+underlying_top_boundary(f, grid, location) =
+    view(f.parent, :, :, underlying_right_boundary_indices(location, topology(grid, 3), grid.Nz, grid.Hz))
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index bc82f48fd3..d4515a7de0 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -132,12 +132,12 @@ end
 for side in sides
     side_str = string(side)
     send_side_halo = Symbol("send_$(side)_halo")
-    underlying_side_halo = Symbol("underlying_$(side)_halo")
+    underlying_side_boundary = Symbol("underlying_$(side)_boundary")
     side_send_tag = Symbol("$(side)_send_tag")
 
     @eval begin
         function $send_side_halo(c, grid, c_location, my_rank, rank_to_send_to)
-            send_buffer = $underlying_side_halo(c, grid, c_location)
+            send_buffer = $underlying_side_boundary(c, grid, c_location)
             send_tag = $side_send_tag(my_rank, rank_to_send_to)
 
             @debug "Sending " * $side_str * " halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
diff --git a/src/Distributed/test_distributed_mpi.jl b/src/Distributed/test_distributed_mpi.jl
index f52a402ab0..e860c22cbd 100644
--- 
a/src/Distributed/test_distributed_mpi.jl +++ b/src/Distributed/test_distributed_mpi.jl @@ -353,7 +353,7 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) for field in fields(dm.model) - set!(field, arch.my_rank) + interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @test all(east_halo(field) .== arch.connectivity.east) @@ -376,7 +376,7 @@ function run_triply_periodic_halo_communication_tests_with_141_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) for field in fields(dm.model) - set!(field, arch.my_rank) + interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @test all(north_halo(field) .== arch.connectivity.north) @@ -399,7 +399,7 @@ function run_triply_periodic_halo_communication_tests_with_114_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) for field in fields(dm.model) - set!(field, arch.my_rank) + interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @test all(top_halo(field) .== arch.connectivity.top) @@ -422,7 +422,7 @@ function run_triply_periodic_halo_communication_tests_with_221_ranks() dm = DistributedModel(architecture=arch, grid=full_grid) for field in fields(dm.model) - set!(field, arch.my_rank) + interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @test all(east_halo(field) .== arch.connectivity.east) @@ -469,7 +469,7 @@ end run_triply_periodic_bc_injection_tests_with_221_ranks() end - # TODO: Larger halos! + # TODO: Test larger halos! @testset "Halo communication" begin @info " Testing halo communication..." run_triply_periodic_halo_communication_tests_with_411_ranks() From 5e14925a0586eef260e2d100c6efaa4d56a4896b Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 12:34:14 -0500 Subject: [PATCH 062/100] Distributed Poisson solver solution is divergence-free! 
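
The test now checks the solver end-to-end instead of returning nothing: build
a random divergent source term R = ∇⋅U★ on each rank, solve ∇²ϕ = R with the
distributed solver, recompute the Laplacian of ϕ with the same finite-difference
operator, and require that it reproduces R. Roughly (a sketch using the names
from src/Distributed/test_distributed_poisson_solvers.jl):

    R = random_divergent_source_term(Float64, child_architecture(arch), local_grid)
    first(solver.storage) .= R
    solve_poisson_equation!(solver)
    interior(ϕ) .= real(first(solver.storage))
    compute_∇²!(∇²ϕ, ϕ, arch, local_grid)
    R ≈ interior(∇²ϕ)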
--- .../distributed_fft_based_poisson_solver.jl | 23 +++--------- .../test_distributed_poisson_solvers.jl | 37 +------------------ 2 files changed, 7 insertions(+), 53 deletions(-) diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl index 480e163788..1f3fce1fa5 100644 --- a/src/Distributed/distributed_fft_based_poisson_solver.jl +++ b/src/Distributed/distributed_fft_based_poisson_solver.jl @@ -4,7 +4,7 @@ struct DistributedFFTBasedPoissonSolver{P, F, L, λ, S} plan :: P full_grid :: F my_grid :: L - my_eigenvalues :: λ + eigenvalues :: λ storage :: S end @@ -24,27 +24,22 @@ function DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) λy = poisson_eigenvalues(full_grid.Ny, full_grid.Ly, 2, TY()) λz = poisson_eigenvalues(full_grid.Nz, full_grid.Lz, 3, TZ()) - my_eigenvalues = (; λx, λy, λz) + I, J, K = arch.my_index + λx = λx[(J-1)*local_grid.Ny+1:J*local_grid.Ny, :, :] - # I, J, K = arch.my_index - # my_eigenvalues = ( - # λx = λx[(I-1)*local_grid.Nx+1:I*local_grid.Nx, :, :], - # λy = λy[:, (J-1)*local_grid.Ny+1:J*local_grid.Ny, :], - # λz = λz[:, :, (K-1)*local_grid.Nz+1:K*local_grid.Nz] - # ) + eigenvalues = (; λx, λy, λz) # transform = PencilFFTs.Transforms.FFT!() transform = PencilFFTs.Transforms.FFT() proc_dims = (arch.ranks[2], arch.ranks[3]) plan = PencilFFTPlan(size(full_grid), transform, proc_dims, MPI.COMM_WORLD) - storage = allocate_input(plan) - return DistributedFFTBasedPoissonSolver(plan, full_grid, local_grid, my_eigenvalues, storage) + return DistributedFFTBasedPoissonSolver(plan, full_grid, local_grid, eigenvalues, storage) end function solve_poisson_equation!(solver::DistributedFFTBasedPoissonSolver) - λx, λy, λz = solver.my_eigenvalues + λx, λy, λz = solver.eigenvalues # https://jipolanco.github.io/PencilFFTs.jl/dev/PencilFFTs/#PencilFFTs.allocate_input # RHS = ϕ = first(solver.storage) @@ -54,12 +49,6 @@ function solve_poisson_equation!(solver::DistributedFFTBasedPoissonSolver) # ϕ = solver.plan * solver.storage ϕ = solver.plan * RHS - @show size(RHS) - @show size(ϕ) - - λx = reshape(λx, 1, solver.my_grid.Nx, 1) - λy = reshape(λy, solver.my_grid.Ny, 1, 1) - # Solve the discrete Poisson equation. # @. ϕ = -RHS / (λx + λy + λz) @. ϕ = -ϕ / (λx + λy + λz) diff --git a/src/Distributed/test_distributed_poisson_solvers.jl b/src/Distributed/test_distributed_poisson_solvers.jl index 993004b807..785418e6c5 100644 --- a/src/Distributed/test_distributed_poisson_solvers.jl +++ b/src/Distributed/test_distributed_poisson_solvers.jl @@ -77,40 +77,5 @@ function divergence_free_poisson_solution_triply_periodic() interior(ϕ) .= real(solver.storage) compute_∇²!(∇²ϕ, ϕ, arch, local_grid) - return nothing -end - -topo = (Periodic, Periodic, Periodic) -full_grid = RegularCartesianGrid(topology=topo, size=(16, 16, 1), extent=(1, 2, 3)) -arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) -dm = DistributedModel(architecture=arch, grid=full_grid) -local_grid = dm.model.grid -solver = DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) -Random.seed!(0) -R = rand(size(full_grid)...) -I, J, K = arch.my_index -R = R[:, local_grid.Ny*(J-1)+1:local_grid.Ny*J, :] -solver.storage .= R -F = solver.plan * solver.storage -λx, λy, λz = solver.my_eigenvalues -λx = λx[(J-1)*local_grid.Ny+1:J*local_grid.Ny, :, :] -@. 
F = -F / (λx + λy + λz) -if MPI.Comm_rank(MPI.COMM_WORLD) == 0 - F[1, 1, 1] = 0 + return @test R ≈ interior(∇²ϕ) end -B = real(solver.plan \ F) - -Nx, Ny, Nz = 16, 16, 1 -Lx, Ly, Lz = 1, 2, 3 -Random.seed!(0) -R = rand(16, 16, 1) -F = fft(R) -λx = @. (2sin((0:Nx - 1) * π / Nx) / (Lx / Nx))^2 -λy = @. (2sin((0:Ny - 1) * π / Ny) / (Ly / Ny))^2 -λz = @. (2sin((0:Nz - 1) * π / Nz) / (Lz / Nz))^2 -λx = reshape(λx, Nx, 1, 1) -λy = reshape(λy, 1, Ny, 1) -λz = reshape(λz, 1, 1, Nz) -@. F = -F / (λx + λy + λz) -F[1, 1, 1] = 0 -B = real(ifft(F)) From 42c36e40c8d37e1f52d7749241002e40383f81d3 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 12:57:09 -0500 Subject: [PATCH 063/100] In-place distributed FFTs --- .../distributed_fft_based_poisson_solver.jl | 17 +++++------------ src/Distributed/test_distributed_mpi.jl | 2 ++ .../test_distributed_poisson_solvers.jl | 12 ++++++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl index 1f3fce1fa5..8ec92d10a5 100644 --- a/src/Distributed/distributed_fft_based_poisson_solver.jl +++ b/src/Distributed/distributed_fft_based_poisson_solver.jl @@ -29,8 +29,7 @@ function DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) eigenvalues = (; λx, λy, λz) - # transform = PencilFFTs.Transforms.FFT!() - transform = PencilFFTs.Transforms.FFT() + transform = PencilFFTs.Transforms.FFT!() proc_dims = (arch.ranks[2], arch.ranks[3]) plan = PencilFFTPlan(size(full_grid), transform, proc_dims, MPI.COMM_WORLD) storage = allocate_input(plan) @@ -41,17 +40,12 @@ end function solve_poisson_equation!(solver::DistributedFFTBasedPoissonSolver) λx, λy, λz = solver.eigenvalues - # https://jipolanco.github.io/PencilFFTs.jl/dev/PencilFFTs/#PencilFFTs.allocate_input - # RHS = ϕ = first(solver.storage) - RHS = ϕ = solver.storage - # Apply forward transforms. - # ϕ = solver.plan * solver.storage - ϕ = solver.plan * RHS + solver.plan * solver.storage # Solve the discrete Poisson equation. - # @. ϕ = -RHS / (λx + λy + λz) - @. ϕ = -ϕ / (λx + λy + λz) + RHS = ϕ = solver.storage[2] + @. ϕ = - RHS / (λx + λy + λz) # Setting DC component of the solution (the mean) to be zero. This is also # necessary because the source term to the Poisson equation has zero mean @@ -61,8 +55,7 @@ function solve_poisson_equation!(solver::DistributedFFTBasedPoissonSolver) end # Apply backward transforms. 
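+    # `Transforms.FFT!` plans from PencilFFTs apply in place on `solver.storage`,
+    # so the backward transform needs no explicit copy of the result.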
- # solver.plan \ solver.storage - solver.storage .= solver.plan \ ϕ + solver.plan \ solver.storage return nothing end diff --git a/src/Distributed/test_distributed_mpi.jl b/src/Distributed/test_distributed_mpi.jl index e860c22cbd..1b33d98ade 100644 --- a/src/Distributed/test_distributed_mpi.jl +++ b/src/Distributed/test_distributed_mpi.jl @@ -477,6 +477,8 @@ end run_triply_periodic_halo_communication_tests_with_114_ranks() # run_triply_periodic_halo_communication_tests_with_221_ranks() end + + include("test_distributed_poisson_solvers.jl") end # MPI.Finalize() diff --git a/src/Distributed/test_distributed_poisson_solvers.jl b/src/Distributed/test_distributed_poisson_solvers.jl index 785418e6c5..ce8b5443b0 100644 --- a/src/Distributed/test_distributed_poisson_solvers.jl +++ b/src/Distributed/test_distributed_poisson_solvers.jl @@ -63,8 +63,7 @@ function divergence_free_poisson_solution_triply_periodic() solver = DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) R = random_divergent_source_term(Float64, child_architecture(arch), local_grid) - # first(solver.storage) .= R - solver.storage .= R + first(solver.storage) .= R solve_poisson_equation!(solver) @@ -74,8 +73,13 @@ function divergence_free_poisson_solution_triply_periodic() ϕ = CenterField(Float64, child_architecture(arch), local_grid, p_bcs) # "pressure" ∇²ϕ = CenterField(Float64, child_architecture(arch), local_grid, p_bcs) - interior(ϕ) .= real(solver.storage) + interior(ϕ) .= real(first(solver.storage)) compute_∇²!(∇²ϕ, ϕ, arch, local_grid) - return @test R ≈ interior(∇²ϕ) + return R ≈ interior(∇²ϕ) +end + +@testset "Distributed FFT-based Poisson solver" begin + @info " Testing distributed FFT-based Poisson solver..." + @test divergence_free_poisson_solution_triply_periodic() end From 999d1f4ff967cb1826ffa735752c7a660d1b1241 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 13:17:48 -0500 Subject: [PATCH 064/100] Distributed Poisson solver needs work for rectangular grids --- src/Distributed/distributed_fft_based_poisson_solver.jl | 2 ++ src/Distributed/distributed_model.jl | 2 ++ src/Distributed/test_distributed_poisson_solvers.jl | 9 +++++---- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl index 8ec92d10a5..b6d8399cf0 100644 --- a/src/Distributed/distributed_fft_based_poisson_solver.jl +++ b/src/Distributed/distributed_fft_based_poisson_solver.jl @@ -1,5 +1,7 @@ using PencilFFTs +import Oceananigans.Solvers: solve_poisson_equation! + struct DistributedFFTBasedPoissonSolver{P, F, L, λ, S} plan :: P full_grid :: F diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index c2d02c16d5..0caf77e4fa 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -7,6 +7,7 @@ include("distributed_utils.jl") include("distributed_architectures.jl") include("halo_communication_bcs.jl") include("halo_communication.jl") +include("distributed_fft_based_poisson_solver.jl") ##### ##### Distributed model struct and constructor @@ -77,6 +78,7 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod architecture = child_architecture(architecture), grid = my_grid, boundary_conditions = communicative_bcs, + # pressure_solver = DistributedFFTBasedPoissonSolver(architecture, grid, my_grid), model_kwargs... 
) diff --git a/src/Distributed/test_distributed_poisson_solvers.jl b/src/Distributed/test_distributed_poisson_solvers.jl index ce8b5443b0..0feae84a48 100644 --- a/src/Distributed/test_distributed_poisson_solvers.jl +++ b/src/Distributed/test_distributed_poisson_solvers.jl @@ -53,10 +53,10 @@ function compute_∇²!(∇²ϕ, ϕ, arch, grid) return nothing end -function divergence_free_poisson_solution_triply_periodic() +function divergence_free_poisson_solution_triply_periodic(grid_points, ranks) topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(16, 16, 1), extent=(1, 2, 3)) - arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + full_grid = RegularCartesianGrid(topology=topo, size=grid_points, extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=ranks) dm = DistributedModel(architecture=arch, grid=full_grid) local_grid = dm.model.grid @@ -81,5 +81,6 @@ end @testset "Distributed FFT-based Poisson solver" begin @info " Testing distributed FFT-based Poisson solver..." - @test divergence_free_poisson_solution_triply_periodic() + @test divergence_free_poisson_solution_triply_periodic((16, 16, 1), (1, 4, 1)) + @test divergence_free_poisson_solution_triply_periodic((64, 64, 1), (1, 4, 1)) end From 1b2bda26776b7fb82d837130687cef26fbfa23d3 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 15:01:49 -0500 Subject: [PATCH 065/100] Pass distributed pressure solver to model --- src/Distributed/distributed_model.jl | 5 ++++- src/Distributed/test_distributed_mpi.jl | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 0caf77e4fa..4d0b93e0e3 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -74,11 +74,14 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod ## Construct local model + pressure_solver = haskey(model_kwargs, :pressure_solver) ? Dict(model_kwargs)[:pressure_solver] : + DistributedFFTBasedPoissonSolver(architecture, grid, my_grid) + my_model = IncompressibleModel(; architecture = child_architecture(architecture), grid = my_grid, boundary_conditions = communicative_bcs, - # pressure_solver = DistributedFFTBasedPoissonSolver(architecture, grid, my_grid), + pressure_solver = pressure_solver, model_kwargs... 
) diff --git a/src/Distributed/test_distributed_mpi.jl b/src/Distributed/test_distributed_mpi.jl index 1b33d98ade..6af7f94dea 100644 --- a/src/Distributed/test_distributed_mpi.jl +++ b/src/Distributed/test_distributed_mpi.jl @@ -194,7 +194,7 @@ function run_triply_periodic_local_grid_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = dm.model.grid @@ -214,7 +214,7 @@ function run_triply_periodic_local_grid_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = dm.model.grid @@ -234,7 +234,7 @@ function run_triply_periodic_local_grid_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = dm.model.grid @@ -254,7 +254,7 @@ function run_triply_periodic_local_grid_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) i, j, k = arch.my_index local_grid = dm.model.grid @@ -278,7 +278,7 @@ function run_triply_periodic_bc_injection_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -295,7 +295,7 @@ function run_triply_periodic_bc_injection_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -312,7 +312,7 @@ function run_triply_periodic_bc_injection_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -329,7 +329,7 @@ function run_triply_periodic_bc_injection_tests_with_221_ranks() topo = (Periodic, Periodic, 
Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -350,7 +350,7 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 6, 4), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) interior(field) .= arch.my_rank @@ -373,7 +373,7 @@ function run_triply_periodic_halo_communication_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(3, 8, 2), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) interior(field) .= arch.my_rank @@ -396,7 +396,7 @@ function run_triply_periodic_halo_communication_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(3, 5, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) interior(field) .= arch.my_rank @@ -419,7 +419,7 @@ function run_triply_periodic_halo_communication_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) interior(field) .= arch.my_rank From 080db7400f0da654d4782825b10087280775a978 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 16:29:21 -0500 Subject: [PATCH 066/100] MPI incompressible turbulence! --- src/Distributed/halo_communication.jl | 4 +- src/Distributed/mpi_turbulence.jl | 88 +++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 src/Distributed/mpi_turbulence.jl diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index d4515a7de0..eba4fde6cd 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -58,10 +58,10 @@ end ##### Filling halos for halo communication boundary conditions ##### -fill_halo_regions!(field::AbstractField{LX, LY, LZ}, arch::AbstractMultiArchitecture, args...) where {LX, LY, LZ} = +fill_halo_regions!(field::AbstractField{LX, LY, LZ}, arch, args...) where {LX, LY, LZ} = fill_halo_regions!(field.data, field.boundary_conditions, arch, field.grid, (LX, LY, LZ), args...) -function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, c_location, args...) +function fill_halo_regions!(c::AbstractArray, bcs, arch, grid, c_location, args...) 
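+    # `arch` is left untyped so the same method can be reached with both
+    # distributed and regular architectures (child_architecture(::CPU) = CPU()).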
barrier = Event(device(child_architecture(arch))) diff --git a/src/Distributed/mpi_turbulence.jl b/src/Distributed/mpi_turbulence.jl new file mode 100644 index 0000000000..eab5a3d321 --- /dev/null +++ b/src/Distributed/mpi_turbulence.jl @@ -0,0 +1,88 @@ +include("distributed_model.jl") + +using Statistics + +using Oceananigans.Fields +using Oceananigans.OutputWriters +using Oceananigans.AbstractOperations + +using Oceananigans.Solvers: calculate_pressure_right_hand_side!, copy_pressure! + +import Oceananigans.Solvers: solve_for_pressure! + +child_architecture(::CPU) = CPU() + +function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver, arch, grid, Δt, U★) + + RHS = first(solver.storage) + + rhs_event = launch!(arch, grid, :xyz, + calculate_pressure_right_hand_side!, RHS, arch, grid, Δt, U★, + dependencies = Event(device(arch))) + + wait(device(arch), rhs_event) + + solve_poisson_equation!(solver) + + ϕ = first(solver.storage) + + copy_event = launch!(arch, grid, :xyz, + copy_pressure!, pressure, ϕ, arch, grid, + dependencies = Event(device(arch))) + + wait(device(arch), copy_event) + + return nothing +end + +topo = (Periodic, Periodic, Periodic) +full_grid = RegularCartesianGrid(topology=topo, size=(128, 128, 1), extent=(2π, 2π, 1)) +arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) +dm = DistributedModel(architecture=arch, grid=full_grid, closure=IsotropicDiffusivity(ν=1e-3)) + +model = dm.model +u₀ = rand(size(model.grid)...) +u₀ .-= mean(u₀) +set!(model, u=0.01u₀, v=0.01u₀) + +# [time_step!(model, 0.1) for _ in 1:10] + +progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time)" +simulation = Simulation(model, Δt=0.1, stop_time=50, iteration_interval=1, progress=progress) + +u, v, w = model.velocities +outputs = (ζ=ComputedField(∂x(v) - ∂y(u)),) +simulation.output_writers[:fields] = NetCDFOutputWriter(model, outputs, filepath="mpi_turbulence_rank$(arch.my_rank).nc", schedule=IterationInterval(1)) + +run!(simulation) + +using Printf +using NCDatasets +using CairoMakie + +if arch.my_rank == 0 + ranks = prod(arch.ranks) + + ds = [NCDataset("mpi_turbulence_rank$r.nc") for r in 0:ranks-1] + + frame = Node(1) + title = @lift @sprintf("MPI turbulence t = %.2f", ds[1]["time"][$frame]) + ζ = [@lift ds[r]["ζ"][:, :, 1, $frame] for r in 1:ranks] + + fig = Figure(resolution=(1600, 1600)) + + for r in 1:ranks + ax = fig[0, 1] = Axis(fig, title="rank $r") # , xlabel="x", ylabel="y") + hm = CairoMakie.heatmap!(ax, ds[r]["xC"], ds[r]["yC"], ζ[r], colormap=:balance, colorrange=(-0.01, 0.01)) + r == ranks && (cb1 = fig[:, 2] = Colorbar(fig, hm, width=30)) + end + + supertitle = fig[0, :] = Label(fig, title, textsize=30) + + record(fig, "mpi_turbulence.mp4", 1:10:length(ds[1]["time"]), framerate=15) do n + @info "Animating MPI turbulence $var frame $n/$(length(ds[1]["time"]))..." 
+ frame[] = n + end + + [close(d) for d in ds] +end From 631d7efa4ad66cbaff3576762ef48489e518e293 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 17:50:42 -0500 Subject: [PATCH 067/100] Gotta communicate pressure halos --- src/Distributed/distributed_architectures.jl | 2 +- src/Distributed/distributed_model.jl | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Distributed/distributed_architectures.jl b/src/Distributed/distributed_architectures.jl index 796ef3a708..848aae5706 100644 --- a/src/Distributed/distributed_architectures.jl +++ b/src/Distributed/distributed_architectures.jl @@ -1,6 +1,6 @@ using Oceananigans.Architectures -using Oceananigans.Grids: validate_tupled_argument +using Oceananigans.Grids: topology, validate_tupled_argument # TODO: Put connectivity inside architecture? MPI should be initialize so you can construct it in there. # Might have to make it MultiCPU(; grid, ranks) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 4d0b93e0e3..04c7c94d0d 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -77,11 +77,19 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod pressure_solver = haskey(model_kwargs, :pressure_solver) ? Dict(model_kwargs)[:pressure_solver] : DistributedFFTBasedPoissonSolver(architecture, grid, my_grid) + p_bcs = PressureBoundaryConditions(grid) + p_bcs = inject_halo_communication_boundary_conditions(p_bcs, my_rank, my_connectivity) + + pHY′ = CenterField(child_architecture(architecture), my_grid, p_bcs) + pNHS = CenterField(child_architecture(architecture), my_grid, p_bcs) + pressures = (pHY′=pHY′, pNHS=pNHS) + my_model = IncompressibleModel(; architecture = child_architecture(architecture), grid = my_grid, boundary_conditions = communicative_bcs, pressure_solver = pressure_solver, + pressures = pressures, model_kwargs... ) From c6b92eb397b0ca28730c28a98800da0832150b06 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 19:43:45 -0500 Subject: [PATCH 068/100] Beautiful MPI turbulence --- src/Distributed/distributed_model.jl | 4 +- src/Distributed/mpi_turbulence.jl | 61 +++++++++++++++++-------- src/Distributed/test_distributed_mpi.jl | 1 + 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 04c7c94d0d..0570918522 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -3,6 +3,8 @@ using MPI using Oceananigans using Oceananigans.Grids +using Oceananigans.Grids: halo_size + include("distributed_utils.jl") include("distributed_architectures.jl") include("halo_communication_bcs.jl") @@ -48,7 +50,7 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod z₁, z₂ = zL + (k-1)*lz, zL + k*lz # FIXME: local grid might have different topology! 
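+    # Passing halo=halo_size(grid) propagates the full grid's halo width to the
+    # local grids; the WENO5 setup in mpi_turbulence.jl needs halo=(3, 3, 3).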
- my_grid = RegularCartesianGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂)) + my_grid = RegularCartesianGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂), halo=halo_size(grid)) ## Change appropriate boundary conditions to halo communication BCs diff --git a/src/Distributed/mpi_turbulence.jl b/src/Distributed/mpi_turbulence.jl index eab5a3d321..296a4361ac 100644 --- a/src/Distributed/mpi_turbulence.jl +++ b/src/Distributed/mpi_turbulence.jl @@ -1,10 +1,16 @@ include("distributed_model.jl") +using MPI + +MPI.Initialized() || MPI.Init() + using Statistics +using Oceananigans.Advection using Oceananigans.Fields using Oceananigans.OutputWriters using Oceananigans.AbstractOperations +using Oceananigans.Utils using Oceananigans.Solvers: calculate_pressure_right_hand_side!, copy_pressure! @@ -36,23 +42,30 @@ function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver, end topo = (Periodic, Periodic, Periodic) -full_grid = RegularCartesianGrid(topology=topo, size=(128, 128, 1), extent=(2π, 2π, 1)) +full_grid = RegularCartesianGrid(topology=topo, size=(512, 512, 1), extent=(4π, 4π, 1), halo=(3, 3, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) -dm = DistributedModel(architecture=arch, grid=full_grid, closure=IsotropicDiffusivity(ν=1e-3)) -model = dm.model -u₀ = rand(size(model.grid)...) -u₀ .-= mean(u₀) -set!(model, u=0.01u₀, v=0.01u₀) +dm = DistributedModel( + architecture = arch, + grid = full_grid, + timestepper = :RungeKutta3, + advection = WENO5(), + closure = IsotropicDiffusivity(ν=1e-5) +) -# [time_step!(model, 0.1) for _ in 1:10] +model = dm.model +u₀ = rand(size(model.grid)...); +u₀ .-= mean(u₀); +set!(model, u=u₀, v=u₀) progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time)" -simulation = Simulation(model, Δt=0.1, stop_time=50, iteration_interval=1, progress=progress) +simulation = Simulation(model, Δt=0.05, stop_time=50, iteration_interval=1, progress=progress) u, v, w = model.velocities outputs = (ζ=ComputedField(∂x(v) - ∂y(u)),) -simulation.output_writers[:fields] = NetCDFOutputWriter(model, outputs, filepath="mpi_turbulence_rank$(arch.my_rank).nc", schedule=IterationInterval(1)) +simulation.output_writers[:fields] = NetCDFOutputWriter(model, outputs, filepath="mpi_turbulence_rank$(arch.my_rank).nc", schedule=TimeInterval(0.1)) + +MPI.Barrier(MPI.COMM_WORLD) run!(simulation) @@ -61,26 +74,34 @@ using NCDatasets using CairoMakie if arch.my_rank == 0 - ranks = prod(arch.ranks) + ranks = 4 ds = [NCDataset("mpi_turbulence_rank$r.nc") for r in 0:ranks-1] frame = Node(1) - title = @lift @sprintf("MPI turbulence t = %.2f", ds[1]["time"][$frame]) + plot_title = @lift @sprintf("Oceananigans.jl + MPI: 2D turbulence t = %.1f", ds[1]["time"][$frame]) ζ = [@lift ds[r]["ζ"][:, :, 1, $frame] for r in 1:ranks] - fig = Figure(resolution=(1600, 1600)) - - for r in 1:ranks - ax = fig[0, 1] = Axis(fig, title="rank $r") # , xlabel="x", ylabel="y") - hm = CairoMakie.heatmap!(ax, ds[r]["xC"], ds[r]["yC"], ζ[r], colormap=:balance, colorrange=(-0.01, 0.01)) - r == ranks && (cb1 = fig[:, 2] = Colorbar(fig, hm, width=30)) + fig = Figure(resolution=(1600, 1200)) + + for r in reverse(1:ranks) + ax = fig[ranks-r+1, 1] = Axis(fig, ylabel="rank $(r-1)", xticks = MultiplesTicks(9, pi, "π"), yticks = MultiplesTicks(3, pi, "π")) + hm = CairoMakie.heatmap!(ax, ds[r]["xF"], ds[r]["yF"], ζ[r], colormap=:balance, colorrange=(-2, 2)) + r > 1 && hidexdecorations!(ax, grid=false) + if r == 1 + 
cb = fig[:, 2] = Colorbar(fig, hm, label = "Vorticity ζ = ∂x(v) - ∂y(u)", width=30) + cb.height = Relative(2/3) + end + xlims!(ax, [0, 4π]) + ylims!(ax, [(r-1)*π, r*π]) end - supertitle = fig[0, :] = Label(fig, title, textsize=30) + supertitle = fig[0, :] = Label(fig, plot_title, textsize=30) + + trim!(fig.layout) - record(fig, "mpi_turbulence.mp4", 1:10:length(ds[1]["time"]), framerate=15) do n - @info "Animating MPI turbulence $var frame $n/$(length(ds[1]["time"]))..." + record(fig, "mpi_turbulence.mp4", 1:length(ds[1]["time"])-1, framerate=30) do n + @info "Animating MPI turbulence frame $n/$(length(ds[1]["time"]))..." frame[] = n end diff --git a/src/Distributed/test_distributed_mpi.jl b/src/Distributed/test_distributed_mpi.jl index 6af7f94dea..99b9a38708 100644 --- a/src/Distributed/test_distributed_mpi.jl +++ b/src/Distributed/test_distributed_mpi.jl @@ -461,6 +461,7 @@ end run_triply_periodic_local_grid_tests_with_221_ranks() end + # Test pressure bcs! @testset "Injection of halo communication BCs" begin @info " Testing injection of halo communication BCs..." run_triply_periodic_bc_injection_tests_with_411_ranks() From eddbdb0bd6a2f155e2e5b8ca9936defffb4bc136 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Sat, 6 Feb 2021 20:45:18 -0500 Subject: [PATCH 069/100] Fix bug in grid used for distributed pressure BCs --- src/Distributed/distributed_model.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 0570918522..f712d3c00b 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -79,7 +79,7 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod pressure_solver = haskey(model_kwargs, :pressure_solver) ? Dict(model_kwargs)[:pressure_solver] : DistributedFFTBasedPoissonSolver(architecture, grid, my_grid) - p_bcs = PressureBoundaryConditions(grid) + p_bcs = PressureBoundaryConditions(my_grid) p_bcs = inject_halo_communication_boundary_conditions(p_bcs, my_rank, my_connectivity) pHY′ = CenterField(child_architecture(architecture), my_grid, p_bcs) From 72cb4e29bef83b92fc7a8786199d862438a5a849 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Thu, 25 Feb 2021 13:43:31 -0500 Subject: [PATCH 070/100] Make it easier to run MPI tests --- src/Distributed/distributed_architectures.jl | 1 + src/Distributed/test_distributed_mpi.jl | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/Distributed/distributed_architectures.jl b/src/Distributed/distributed_architectures.jl index 848aae5706..0b6cc114e6 100644 --- a/src/Distributed/distributed_architectures.jl +++ b/src/Distributed/distributed_architectures.jl @@ -15,6 +15,7 @@ struct MultiCPU{R, I, ρ, C} <: AbstractMultiArchitecture end child_architecture(::MultiCPU) = CPU() +child_architecture(::CPU) = CPU() ##### ##### Converting between index and MPI rank taking k as the fast index diff --git a/src/Distributed/test_distributed_mpi.jl b/src/Distributed/test_distributed_mpi.jl index 99b9a38708..4320f68390 100644 --- a/src/Distributed/test_distributed_mpi.jl +++ b/src/Distributed/test_distributed_mpi.jl @@ -7,6 +7,8 @@ using Oceananigans.BoundaryConditions: fill_halo_regions! MPI.Initialized() || MPI.Init() comm = MPI.COMM_WORLD +include("distributed_model.jl") + # Right now just testing with 4 ranks! 
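# --- Editor's sketch, not part of this patch: the assertion below presumes the
# test script was launched with exactly four MPI processes, e.g. the assumed
# invocation
#     mpiexec -np 4 julia --project src/Distributed/test_distributed_mpi.jl
# (the Buildkite job added later in this series uses the same `mpiexec -np 4`
# pattern). Every process then reports the same communicator size, while each
# holds a distinct rank in 0:3:
sketch_rank = MPI.Comm_rank(comm)            # hypothetical variable; unique per process
@assert 0 <= sketch_rank < MPI.Comm_size(comm)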
mpi_ranks = MPI.Comm_size(comm) @assert mpi_ranks == 4 From 865164e5f7bc9d43ff0508602cd8991e68436a8b Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 13:16:06 -0500 Subject: [PATCH 071/100] Add MPI.jl and PencilFFTs.jl as dependencies --- Manifest.toml | 202 +++++++++++++++++++++++++++++++++++++------------- Project.toml | 2 + 2 files changed, 154 insertions(+), 50 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 5b65c6b50b..4d3e9d6205 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,9 +2,9 @@ [[AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716" +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "0.5.0" +version = "1.0.1" [[Adapt]] deps = ["LinearAlgebra"] @@ -12,11 +12,17 @@ git-tree-sha1 = "ffcfa2d345aaee0ef3d8346a073d5dd03c983ebe" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" version = "3.2.0" +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[ArrayInterface]] +deps = ["IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"] +git-tree-sha1 = "e7edcc1ac140cce87b7442ff0fa88b5f19fb71fa" +uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +version = "3.1.3" + [[Artifacts]] -deps = ["Pkg"] -git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744" uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -version = "1.3.0" [[BFloat16s]] deps = ["LinearAlgebra", "Test"] @@ -39,10 +45,10 @@ uuid = "179af706-886a-5703-950a-314cd64e0468" version = "0.1.1" [[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] -git-tree-sha1 = "6ccc73b2d8b671f7a65c92b5f08f81422ebb7547" +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "NNlib", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] +git-tree-sha1 = "2d90e6c29706856928f02e11ae15e71889905e34" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "2.4.1" +version = "2.6.1" [[Cassette]] git-tree-sha1 = "9cc225870ec32ce7b9c773d4dcdaef32f622cf89" @@ -68,10 +74,8 @@ uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" version = "3.25.0" [[CompilerSupportLibraries_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "8e695f735fca77e9708e795eda62afdb869cbb70" +deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "0.3.4+0" [[Crayons]] git-tree-sha1 = "3f71217b538d7aaee0b69ab47d9b7724ca8afa0d" @@ -106,6 +110,16 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "50ddf44c53698f5e784bbebb3f4b21c5807401b1" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.3" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + [[ExprTools]] git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" @@ -113,9 +127,9 @@ version = "0.1.3" [[FFTW]] deps = ["AbstractFFTs", "FFTW_jll", 
"IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"] -git-tree-sha1 = "8fda0934cb99db617171f7296dc361f4d6fa5424" +git-tree-sha1 = "1b48dbde42f307e48685fa9213d8b9f8c0d87594" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" -version = "1.3.0" +version = "1.3.2" [[FFTW_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -130,10 +144,10 @@ uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "6.2.0" [[GPUCompiler]] -deps = ["DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "c853c810b52a80f9aad79ab109207889e57f41ef" +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "ef2839b063e158672583b9c09d2cf4876a8d3d55" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.8.3" +version = "0.10.0" [[Glob]] git-tree-sha1 = "4df9f7e06108728ebf00a0a11edee4b29a482bb2" @@ -146,6 +160,11 @@ git-tree-sha1 = "fd83fa0bde42e01952757f01149dd968c06c4dba" uuid = "0234f1f7-429e-5d53-9886-15a909be8d59" version = "1.12.0+1" +[[IfElse]] +git-tree-sha1 = "28e837ff3e7a6c3cdb252ce49fb412c8eb3caeef" +uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" +version = "0.1.0" + [[IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "d979e54b71da82f3a65b62553da4fc3d18c9004c" @@ -163,15 +182,21 @@ version = "1.0.0" [[JLD2]] deps = ["CodecZlib", "DataStructures", "MacroTools", "Mmap", "Pkg", "Printf", "Requires", "UUIDs"] -git-tree-sha1 = "bb9a457481adf060ab5898823a49d4f854ff4ddd" +git-tree-sha1 = "b8343a7f96591404ade118b3a7014e1a52062465" uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819" -version = "0.4.0" +version = "0.4.2" [[JLLWrappers]] git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.2.0" +[[JSON3]] +deps = ["Dates", "Mmap", "Parsers", "StructTypes", "UUIDs"] +git-tree-sha1 = "62d4063c67d7c84d5788107878bb925ceaadd252" +uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +version = "1.7.1" + [[KernelAbstractions]] deps = ["Adapt", "CUDA", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] git-tree-sha1 = "ee7f03c23d874c8353813a44315daf82a1e82046" @@ -184,21 +209,25 @@ git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "3.6.0" +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + [[LibCURL_jll]] -deps = ["LibSSH2_jll", "Libdl", "MbedTLS_jll", "Pkg", "Zlib_jll", "nghttp2_jll"] -git-tree-sha1 = "897d962c20031e6012bba7b3dcb7a667170dad17" +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "7.70.0+2" [[LibGit2]] -deps = ["Printf"] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[LibSSH2_jll]] -deps = ["Libdl", "MbedTLS_jll", "Pkg"] -git-tree-sha1 = "717705533148132e5466f2924b9a3657b16158e8" +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.9.0+3" [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -211,10 +240,22 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MKL_jll]] -deps = ["IntelOpenMP_jll", "Libdl", "Pkg"] -git-tree-sha1 = 
"eb540ede3aabb8284cb482aa41d00d6ca850b1f8" +deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "c253236b0ed414624b083e6b72bfe891fbd2c7af" uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" -version = "2020.2.254+0" +version = "2021.1.1+1" + +[[MPI]] +deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "Pkg", "Random", "Requires", "Serialization", "Sockets"] +git-tree-sha1 = "d3aae0fd4d9e1a09c3e2fc728fbe2522ec6d54bc" +uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195" +version = "0.16.1" + +[[MPICH_jll]] +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "4d37f1e07b4e2a74462eebf9ee48c626d15ffdac" +uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4" +version = "3.3.2+10" [[MacroTools]] deps = ["Markdown", "Random"] @@ -227,14 +268,27 @@ deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[MbedTLS_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6" +deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.16.8+1" + +[[Memoize]] +deps = ["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + +[[MicrosoftMPI_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "e5c90234b3967684c9c6f87b4a54549b4ce21836" +uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf" +version = "10.1.3+0" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + [[NCDatasets]] deps = ["CFTime", "DataStructures", "Dates", "NetCDF_jll", "Printf"] git-tree-sha1 = "b71d83c87d80f5c54c55a7a9a3aa42bf931c72aa" @@ -249,15 +303,24 @@ version = "0.7.14" [[NetCDF_jll]] deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Pkg", "Zlib_jll", "nghttp2_jll"] -git-tree-sha1 = "d5835f95aea3b93965a1a7c06de9aace8cb82d99" +git-tree-sha1 = "0cf4d1bf2ef45156aed85c9ac5f8c7e697d9288c" uuid = "7243133f-43d8-5620-bbf4-c2c921802cf3" -version = "400.701.400+0" +version = "400.702.400+0" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" [[OffsetArrays]] deps = ["Adapt"] -git-tree-sha1 = "76622f08645764e040b4d7e86d0ff471fd126ae4" +git-tree-sha1 = "b3dfef5f2be7d7eb0e782ba9146a5271ee426e90" uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" -version = "1.5.3" +version = "1.6.2" + +[[OpenMPI_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "41b983e26a7ab8c9bf05f7d70c274b817d541b46" +uuid = "fe0851c0-eecd-5654-98d4-656369965a5c" +version = "4.0.2+2" [[OpenSSL_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -272,12 +335,30 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.3+4" [[OrderedCollections]] -git-tree-sha1 = "d45739abcfc03b51f6a42712894a593f74c80a23" +git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.3.3" +version = "1.4.0" + +[[Parsers]] +deps = ["Dates"] +git-tree-sha1 = "223a825cccef2228f3fdbf2ecc7ca93363059073" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "1.0.16" + +[[PencilArrays]] +deps = ["ArrayInterface", "JSON3", "Libdl", "LinearAlgebra", "MPI", "OffsetArrays", "Reexport", "Requires", "StaticArrays", "StaticPermutations", "TimerOutputs"] +git-tree-sha1 = "6921d07316f41e2be5befd8b815eee28d3fab9f8" +uuid = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e" +version = "0.9.0" + +[[PencilFFTs]] 
+deps = ["AbstractFFTs", "FFTW", "LinearAlgebra", "MPI", "PencilArrays", "Reexport", "TimerOutputs"] +git-tree-sha1 = "a7665838a566accd7d9cf308bbb497126dc5edf4" +uuid = "4a48f351-57a6-4416-9ec4-c37015456aae" +version = "0.12.1" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -285,7 +366,7 @@ deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" [[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] @@ -340,9 +421,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] deps = ["ChainRulesCore", "OpenSpecFun_jll"] -git-tree-sha1 = "75394dbe2bd346beeed750fb02baa6445487b862" +git-tree-sha1 = "5919936c0e92cff40e57d0ddf0ceb667d42e5902" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.2.1" +version = "1.3.0" + +[[Static]] +deps = ["IfElse"] +git-tree-sha1 = "98ace568bf638e89eac33c99337f3c8c6e2227b8" +uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" +version = "0.2.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] @@ -350,6 +437,11 @@ git-tree-sha1 = "9da72ed50e94dbff92036da395275ed114e04d49" uuid = "90137ffa-7385-5640-81b9-e52037218182" version = "1.0.1" +[[StaticPermutations]] +git-tree-sha1 = "193c3daa18ff3e55c1dae66acb6a762c4a3bdb0b" +uuid = "15972242-4b8f-49a0-b8a1-9ac0e7a1a45d" +version = "0.3.0" + [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -360,6 +452,16 @@ git-tree-sha1 = "26ea43b4be7e919a2390c3c0f824e7eb4fc19a0a" uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" version = "0.5.0" +[[StructTypes]] +deps = ["Dates", "UUIDs"] +git-tree-sha1 = "d7f4287dbc1e590265f50ceda1b40ed2bb31bbbb" +uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" +version = "1.4.0" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + [[TableTraits]] deps = ["IteratorInterfaceExtensions"] git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" @@ -372,15 +474,19 @@ git-tree-sha1 = "a716dde43d57fa537a19058d044b495301ba6565" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" version = "1.3.2" +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + [[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] deps = ["Printf"] -git-tree-sha1 = "3318281dd4121ecf9713ce1383b9ace7d7476fdd" +git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.7" +version = "0.5.8" [[TranscodingStreams]] deps = ["Random", "Test"] @@ -396,13 +502,9 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[Zlib_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "320228915c8debb12cb434c59057290f0834dbf6" +deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.11+18" [[nghttp2_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "8e2c44ab4d49ad9518f359ed8b62f83ba8beede4" +deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.40.0+2" diff --git a/Project.toml 
b/Project.toml index d67e8d9b83..400c8bd74f 100644 --- a/Project.toml +++ b/Project.toml @@ -14,9 +14,11 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +PencilFFTs = "4a48f351-57a6-4416-9ec4-c37015456aae" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" From efe89da39e224c51e99545bfb0c0f9d6bffd02ba Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 13:17:32 -0500 Subject: [PATCH 072/100] New `Distributed` sub-module --- src/Distributed/Distributed.jl | 16 ++++++++++++++++ .../distributed_fft_based_poisson_solver.jl | 17 ++++------------- src/Distributed/distributed_model.jl | 7 ------- src/Distributed/halo_communication_bcs.jl | 5 +++-- ..._architectures.jl => multi_architectures.jl} | 0 src/Oceananigans.jl | 5 +++++ 6 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 src/Distributed/Distributed.jl rename src/Distributed/{distributed_architectures.jl => multi_architectures.jl} (100%) diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl new file mode 100644 index 0000000000..42d77c19c8 --- /dev/null +++ b/src/Distributed/Distributed.jl @@ -0,0 +1,16 @@ +module Distributed + +export + MultiCPU, + HaloCommunication, HaloCommunicationBC, + DistributedFFTBasedPoissonSolver, + DistributedModel + +include("distributed_utils.jl") +include("multi_architectures.jl") +include("halo_communication_bcs.jl") +include("halo_communication.jl") +include("distributed_fft_based_poisson_solver.jl") +include("distributed_model.jl") + +end # module diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl index b6d8399cf0..5595e1b35c 100644 --- a/src/Distributed/distributed_fft_based_poisson_solver.jl +++ b/src/Distributed/distributed_fft_based_poisson_solver.jl @@ -1,6 +1,6 @@ -using PencilFFTs +import PencilFFTs -import Oceananigans.Solvers: solve_poisson_equation! +import Oceananigans.Solvers: poisson_eigenvalues, solve_poisson_equation! struct DistributedFFTBasedPoissonSolver{P, F, L, λ, S} plan :: P @@ -10,15 +10,6 @@ struct DistributedFFTBasedPoissonSolver{P, F, L, λ, S} storage :: S end -reshaped_size(N, dim) = dim == 1 ? (N, 1, 1) : - dim == 2 ? (1, N, 1) : - dim == 3 ? (1, 1, N) : nothing - -function poisson_eigenvalues(N, L, dim, ::Periodic) - inds = reshape(1:N, reshaped_size(N, dim)...) - return @. 
(2sin((inds - 1) * π / N) / (L / N))^2 -end - function DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) topo = (TX, TY, TZ) = topology(full_grid) @@ -33,8 +24,8 @@ function DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) transform = PencilFFTs.Transforms.FFT!() proc_dims = (arch.ranks[2], arch.ranks[3]) - plan = PencilFFTPlan(size(full_grid), transform, proc_dims, MPI.COMM_WORLD) - storage = allocate_input(plan) + plan = PencilFFTs.PencilFFTPlan(size(full_grid), transform, proc_dims, MPI.COMM_WORLD) + storage = PencilFFTs.allocate_input(plan) return DistributedFFTBasedPoissonSolver(plan, full_grid, local_grid, eigenvalues, storage) end diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index f712d3c00b..6feebaf4fe 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -2,15 +2,8 @@ using MPI using Oceananigans using Oceananigans.Grids - using Oceananigans.Grids: halo_size -include("distributed_utils.jl") -include("distributed_architectures.jl") -include("halo_communication_bcs.jl") -include("halo_communication.jl") -include("distributed_fft_based_poisson_solver.jl") - ##### ##### Distributed model struct and constructor ##### diff --git a/src/Distributed/halo_communication_bcs.jl b/src/Distributed/halo_communication_bcs.jl index aa21385545..ca72f7c485 100644 --- a/src/Distributed/halo_communication_bcs.jl +++ b/src/Distributed/halo_communication_bcs.jl @@ -1,10 +1,11 @@ +using Oceananigans.BoundaryConditions using Oceananigans.BoundaryConditions: BCType import Oceananigans.BoundaryConditions: bctype_str, print_condition struct HaloCommunication <: BCType end -HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} +const HaloCommunicationBC = BoundaryCondition{<:HaloCommunication} bctype_str(::HaloCommunicationBC) ="HaloCommunication" @@ -51,4 +52,4 @@ function inject_halo_communication_boundary_conditions(field_bcs, my_rank, conne isnothing(rank_top) ? 
field_bcs.top : top_comm_bc) return FieldBoundaryConditions(x_bcs, y_bcs, z_bcs) -end \ No newline at end of file +end diff --git a/src/Distributed/distributed_architectures.jl b/src/Distributed/multi_architectures.jl similarity index 100% rename from src/Distributed/distributed_architectures.jl rename to src/Distributed/multi_architectures.jl diff --git a/src/Oceananigans.jl b/src/Oceananigans.jl index 4ca575c376..554409df70 100644 --- a/src/Oceananigans.jl +++ b/src/Oceananigans.jl @@ -75,6 +75,9 @@ export # Abstract operations ∂x, ∂y, ∂z, @at, + # Distributed + MultiCPU, + # Utils prettytime @@ -165,6 +168,7 @@ include("Diagnostics/Diagnostics.jl") include("OutputWriters/OutputWriters.jl") include("Simulations/Simulations.jl") include("AbstractOperations/AbstractOperations.jl") +include("Distributed/Distributed.jl") ##### ##### Needed so we can export names from sub-modules at the top-level @@ -190,6 +194,7 @@ using .Diagnostics using .OutputWriters using .Simulations using .AbstractOperations +using .Distributed function __init__() threads = Threads.nthreads() From e75209aff981017db11de3b8702a9fe6174ca974 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 13:48:56 -0500 Subject: [PATCH 073/100] Move MPI tests into `test` directory --- src/Distributed/Distributed.jl | 3 +- src/Distributed/distributed_model.jl | 2 +- src/Distributed/mpi_turbulence.jl | 2 +- src/Distributed/multi_architectures.jl | 4 +-- src/Distributed/test_distributed_mpi.jl | 32 +++++++++---------- test/runtests.jl | 22 ++++++++----- .../test_distributed_poisson_solvers.jl | 21 ++---------- 7 files changed, 38 insertions(+), 48 deletions(-) rename {src/Distributed => test}/test_distributed_poisson_solvers.jl (81%) diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl index 42d77c19c8..a6b74188e9 100644 --- a/src/Distributed/Distributed.jl +++ b/src/Distributed/Distributed.jl @@ -1,8 +1,9 @@ module Distributed export - MultiCPU, + MultiCPU, child_architecture, HaloCommunication, HaloCommunicationBC, + inject_halo_communication_boundary_conditions, DistributedFFTBasedPoissonSolver, DistributedModel diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_model.jl index 6feebaf4fe..e5d263275d 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_model.jl @@ -43,7 +43,7 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod z₁, z₂ = zL + (k-1)*lz, zL + k*lz # FIXME: local grid might have different topology! 
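# --- Editor's sketch of the slab decomposition above (illustrative numbers,
# assumed — not from this patch): for ranks = (1, 4, 1) on a domain with
# y ∈ (0, 2), each rank index j ∈ 1:4 owns a slab of height ly = 2/4 = 0.5,
# so e.g. j = 3 gets
#     y₁, y₂ = 0 + (3 - 1) * 0.5, 0 + 3 * 0.5    # (1.0, 1.5)
# mirroring the x/y/z extent arithmetic used here for the local grid.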
- my_grid = RegularCartesianGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂), halo=halo_size(grid)) + my_grid = RegularRectilinearGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂), halo=halo_size(grid)) ## Change appropriate boundary conditions to halo communication BCs diff --git a/src/Distributed/mpi_turbulence.jl b/src/Distributed/mpi_turbulence.jl index 296a4361ac..aaebb6dc5d 100644 --- a/src/Distributed/mpi_turbulence.jl +++ b/src/Distributed/mpi_turbulence.jl @@ -42,7 +42,7 @@ function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver, end topo = (Periodic, Periodic, Periodic) -full_grid = RegularCartesianGrid(topology=topo, size=(512, 512, 1), extent=(4π, 4π, 1), halo=(3, 3, 3)) +full_grid = RegularRectilinearGrid(topology=topo, size=(512, 512, 1), extent=(4π, 4π, 1), halo=(3, 3, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) dm = DistributedModel( diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl index 0b6cc114e6..6a92003bb3 100644 --- a/src/Distributed/multi_architectures.jl +++ b/src/Distributed/multi_architectures.jl @@ -117,9 +117,7 @@ function MultiCPU(; grid, ranks) if total_ranks != mpi_ranks throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " * - "with number of MPI ranks: $mpi_ranks. Exiting with return code 1.")) - MPI.Finalize() - exit(code=1) + "with number of MPI ranks: $mpi_ranks.")) end comm = MPI.COMM_WORLD diff --git a/src/Distributed/test_distributed_mpi.jl b/src/Distributed/test_distributed_mpi.jl index 4320f68390..4780ff253a 100644 --- a/src/Distributed/test_distributed_mpi.jl +++ b/src/Distributed/test_distributed_mpi.jl @@ -19,7 +19,7 @@ mpi_ranks = MPI.Comm_size(comm) function run_triply_periodic_rank_connectivity_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @@ -56,7 +56,7 @@ end function run_triply_periodic_rank_connectivity_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @@ -99,7 +99,7 @@ end function run_triply_periodic_rank_connectivity_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @@ -145,7 +145,7 @@ end function run_triply_periodic_rank_connectivity_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) @@ -194,7 +194,7 @@ end function run_triply_periodic_local_grid_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = 
RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -214,7 +214,7 @@ end function run_triply_periodic_local_grid_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -234,7 +234,7 @@ end function run_triply_periodic_local_grid_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -254,7 +254,7 @@ end function run_triply_periodic_local_grid_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -278,7 +278,7 @@ end function run_triply_periodic_bc_injection_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -295,7 +295,7 @@ end function run_triply_periodic_bc_injection_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -312,7 +312,7 @@ end function run_triply_periodic_bc_injection_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -329,7 +329,7 @@ end function run_triply_periodic_bc_injection_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -350,7 +350,7 @@ end function run_triply_periodic_halo_communication_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 6, 4), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 6, 4), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) dm = 
DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -373,7 +373,7 @@ end function run_triply_periodic_halo_communication_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(3, 8, 2), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(3, 8, 2), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -396,7 +396,7 @@ end function run_triply_periodic_halo_communication_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(3, 5, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(3, 5, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) @@ -419,7 +419,7 @@ end function run_triply_periodic_halo_communication_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) diff --git a/test/runtests.jl b/test/runtests.jl index 5bd1027752..ea294bfabc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,7 @@ using LinearAlgebra using Logging using CUDA +using MPI using JLD2 using FFTW using OffsetArrays @@ -118,6 +119,14 @@ group = get(ENV, "TEST_GROUP", :all) |> Symbol end end + if group == :shallow_water || group == :all + include("test_shallow_water_models.jl") + end + + if group == :hydrostatic_free_surface || group == :all + include("test_hydrostatic_free_surface_models.jl") + end + if group == :simulation || group == :all @testset "Simulation tests" begin include("test_simulations.jl") @@ -128,6 +137,11 @@ group = get(ENV, "TEST_GROUP", :all) |> Symbol end end + if group == :distributed || group == :all + MPI.Initialized() || MPI.Init() + include("test_distributed_poisson_solvers.jl") + end + if group == :regression || group == :all include("test_regression.jl") end @@ -141,12 +155,4 @@ group = get(ENV, "TEST_GROUP", :all) |> Symbol if group == :convergence include("test_convergence.jl") end - - if group == :shallow_water || group == :all - include("test_shallow_water_models.jl") - end - - if group == :hydrostatic_free_surface || group == :all - include("test_hydrostatic_free_surface_models.jl") - end end diff --git a/src/Distributed/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl similarity index 81% rename from src/Distributed/test_distributed_poisson_solvers.jl rename to test/test_distributed_poisson_solvers.jl index 0feae84a48..29621bda19 100644 --- a/src/Distributed/test_distributed_poisson_solvers.jl +++ b/test/test_distributed_poisson_solvers.jl @@ -1,21 +1,6 @@ -using Test -using Oceananigans -using Oceananigans.Architectures -using Oceananigans.Solvers -using Oceananigans.Utils -using Oceananigans.Operators -using Oceananigans.BoundaryConditions: fill_halo_regions! 
-using KernelAbstractions: @kernel, @index, Event - -@kernel function ∇²!(grid, f, ∇²f) - i, j, k = @index(Global, NTuple) - @inbounds ∇²f[i, j, k] = ∇²(i, j, k, grid, f) -end -@kernel function divergence!(grid, u, v, w, div) - i, j, k = @index(Global, NTuple) - @inbounds div[i, j, k] = divᶜᶜᶜ(i, j, k, grid, u, v, w) -end +using Oceananigans +using Oceananigans.Distributed function random_divergent_source_term(FT, arch, grid) # Generate right hand side from a random (divergent) velocity field. @@ -55,7 +40,7 @@ end function divergence_free_poisson_solution_triply_periodic(grid_points, ranks) topo = (Periodic, Periodic, Periodic) - full_grid = RegularCartesianGrid(topology=topo, size=grid_points, extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=grid_points, extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=ranks) dm = DistributedModel(architecture=arch, grid=full_grid) From 0356e0dae98a0d64e66c1c5d45bc7970389cf3f5 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 14:25:27 -0500 Subject: [PATCH 074/100] MPI tests passing locally --- src/Distributed/multi_architectures.jl | 2 -- test/runtests.jl | 2 ++ .../test_distributed_models.jl | 14 ++------------ test/test_distributed_poisson_solvers.jl | 3 --- 4 files changed, 4 insertions(+), 17 deletions(-) rename src/Distributed/test_distributed_mpi.jl => test/test_distributed_models.jl (98%) diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl index 6a92003bb3..4e71da67da 100644 --- a/src/Distributed/multi_architectures.jl +++ b/src/Distributed/multi_architectures.jl @@ -120,8 +120,6 @@ function MultiCPU(; grid, ranks) "with number of MPI ranks: $mpi_ranks.")) end - comm = MPI.COMM_WORLD - my_connectivity = RankConnectivity(my_index, ranks, topology(grid)) return MultiCPU(my_rank, my_index, ranks, my_connectivity) diff --git a/test/runtests.jl b/test/runtests.jl index ea294bfabc..14917ca304 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -29,6 +29,7 @@ using Oceananigans.Diagnostics using Oceananigans.OutputWriters using Oceananigans.TurbulenceClosures using Oceananigans.AbstractOperations +using Oceananigans.Distributed using Oceananigans.Logger using Oceananigans.Units using Oceananigans.Utils @@ -139,6 +140,7 @@ group = get(ENV, "TEST_GROUP", :all) |> Symbol if group == :distributed || group == :all MPI.Initialized() || MPI.Init() + include("test_distributed_models.jl") include("test_distributed_poisson_solvers.jl") end diff --git a/src/Distributed/test_distributed_mpi.jl b/test/test_distributed_models.jl similarity index 98% rename from src/Distributed/test_distributed_mpi.jl rename to test/test_distributed_models.jl index 4780ff253a..6f85ed2d3e 100644 --- a/src/Distributed/test_distributed_mpi.jl +++ b/test/test_distributed_models.jl @@ -1,15 +1,10 @@ -using Test using MPI -using Oceananigans using Oceananigans.BoundaryConditions: fill_halo_regions! - -MPI.Initialized() || MPI.Init() -comm = MPI.COMM_WORLD - -include("distributed_model.jl") +using Oceananigans.Distributed: index2rank, east_halo, west_halo, north_halo, south_halo, top_halo, bottom_halo # Right now just testing with 4 ranks! 
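# --- Editor's sketch: a hypothetical helper mirroring the periodic wraparound
# that the rank-connectivity tests below verify for a decomposition with R
# ranks along one dimension (R = 4 here):
sketch_neighbors(r, R=4) = (mod(r - 1, R), mod(r + 1, R))
# e.g. sketch_neighbors(0) == (3, 1) and sketch_neighbors(3) == (2, 0),
# so the first and last ranks are each other's neighbors.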
+comm = MPI.COMM_WORLD mpi_ranks = MPI.Comm_size(comm) @assert mpi_ranks == 4 @@ -480,9 +475,4 @@ end run_triply_periodic_halo_communication_tests_with_114_ranks() # run_triply_periodic_halo_communication_tests_with_221_ranks() end - - include("test_distributed_poisson_solvers.jl") end - -# MPI.Finalize() -# @test MPI.Finalized() diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl index 29621bda19..305b144ce4 100644 --- a/test/test_distributed_poisson_solvers.jl +++ b/test/test_distributed_poisson_solvers.jl @@ -1,7 +1,4 @@ -using Oceananigans -using Oceananigans.Distributed - function random_divergent_source_term(FT, arch, grid) # Generate right hand side from a random (divergent) velocity field. Ru = CenterField(FT, arch, grid, UVelocityBoundaryConditions(grid)) From bc118999d890751a29b80fff0384d5976424bcf2 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 14:28:41 -0500 Subject: [PATCH 075/100] Need a distributed `solve_for_pressure` --- .../Distributed => sandbox}/mpi_turbulence.jl | 29 ------------------- src/Distributed/Distributed.jl | 1 + .../distributed_solve_for_pressure.jl | 24 +++++++++++++++ 3 files changed, 25 insertions(+), 29 deletions(-) rename {src/Distributed => sandbox}/mpi_turbulence.jl (76%) create mode 100644 src/Distributed/distributed_solve_for_pressure.jl diff --git a/src/Distributed/mpi_turbulence.jl b/sandbox/mpi_turbulence.jl similarity index 76% rename from src/Distributed/mpi_turbulence.jl rename to sandbox/mpi_turbulence.jl index aaebb6dc5d..1d92fa9f4f 100644 --- a/src/Distributed/mpi_turbulence.jl +++ b/sandbox/mpi_turbulence.jl @@ -1,5 +1,3 @@ -include("distributed_model.jl") - using MPI MPI.Initialized() || MPI.Init() @@ -14,33 +12,6 @@ using Oceananigans.Utils using Oceananigans.Solvers: calculate_pressure_right_hand_side!, copy_pressure! -import Oceananigans.Solvers: solve_for_pressure! - -child_architecture(::CPU) = CPU() - -function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver, arch, grid, Δt, U★) - - RHS = first(solver.storage) - - rhs_event = launch!(arch, grid, :xyz, - calculate_pressure_right_hand_side!, RHS, arch, grid, Δt, U★, - dependencies = Event(device(arch))) - - wait(device(arch), rhs_event) - - solve_poisson_equation!(solver) - - ϕ = first(solver.storage) - - copy_event = launch!(arch, grid, :xyz, - copy_pressure!, pressure, ϕ, arch, grid, - dependencies = Event(device(arch))) - - wait(device(arch), copy_event) - - return nothing -end - topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(512, 512, 1), extent=(4π, 4π, 1), halo=(3, 3, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl index a6b74188e9..73b9704054 100644 --- a/src/Distributed/Distributed.jl +++ b/src/Distributed/Distributed.jl @@ -12,6 +12,7 @@ include("multi_architectures.jl") include("halo_communication_bcs.jl") include("halo_communication.jl") include("distributed_fft_based_poisson_solver.jl") +include("distributed_solve_for_pressure.jl") include("distributed_model.jl") end # module diff --git a/src/Distributed/distributed_solve_for_pressure.jl b/src/Distributed/distributed_solve_for_pressure.jl new file mode 100644 index 0000000000..47f56cef49 --- /dev/null +++ b/src/Distributed/distributed_solve_for_pressure.jl @@ -0,0 +1,24 @@ +import Oceananigans.Solvers: solve_for_pressure! 
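# --- Editor's note (descriptive summary of the function below, added for
# clarity): the distributed pressure solve proceeds in three event-synchronized
# stages —
#   1. launch a kernel that fills first(solver.storage), the PencilFFTs input
#      array, with the Poisson right-hand side built from U★ and Δt;
#   2. call solve_poisson_equation!(solver), which is assumed to forward
#      transform, divide by the solver's stored eigenvalues, and transform back
#      across ranks;
#   3. launch a kernel that copies the solution into `pressure` on each rank.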
+ +function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver, arch, grid, Δt, U★) + + RHS = first(solver.storage) + + rhs_event = launch!(arch, grid, :xyz, + calculate_pressure_right_hand_side!, RHS, arch, grid, Δt, U★, + dependencies = Event(device(arch))) + + wait(device(arch), rhs_event) + + solve_poisson_equation!(solver) + + ϕ = first(solver.storage) + + copy_event = launch!(arch, grid, :xyz, + copy_pressure!, pressure, ϕ, arch, grid, + dependencies = Event(device(arch))) + + wait(device(arch), copy_event) + + return nothing +end From bfcf223738fc9a7e13583d811fc2ed6b4b3c70ce Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 14:52:18 -0500 Subject: [PATCH 076/100] Test time stepping and running simulations --- test/test_distributed_models.jl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 6f85ed2d3e..8407bf76e2 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -475,4 +475,21 @@ end run_triply_periodic_halo_communication_tests_with_114_ranks() # run_triply_periodic_halo_communication_tests_with_221_ranks() end + + @testset "Time stepping" begin + topo = (Periodic, Periodic, Periodic) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = dm.model + + time_step!(model, 1) + @test dm isa DistributedModel + @test model.clock.time == 1 + + simulation = Simulation(model, Δt=1, stop_iteration=2) + run!(simulation) + @test dm isa DistributedModel + @test model.clock.time == 2 + end end From 703f9e07accf00ab83542e335e5ae377f9396a8c Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 15:01:04 -0500 Subject: [PATCH 077/100] `DistributedModel` -> `DistributedIncompressibleModel` --- sandbox/mpi_turbulence.jl | 2 +- src/Distributed/Distributed.jl | 4 +-- ...jl => distributed_incompressible_model.jl} | 10 +++---- test/test_distributed_models.jl | 30 +++++++++---------- test/test_distributed_poisson_solvers.jl | 2 +- 5 files changed, 24 insertions(+), 24 deletions(-) rename src/Distributed/{distributed_model.jl => distributed_incompressible_model.jl} (90%) diff --git a/sandbox/mpi_turbulence.jl b/sandbox/mpi_turbulence.jl index 1d92fa9f4f..d5b9824ad7 100644 --- a/sandbox/mpi_turbulence.jl +++ b/sandbox/mpi_turbulence.jl @@ -16,7 +16,7 @@ topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(512, 512, 1), extent=(4π, 4π, 1), halo=(3, 3, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) -dm = DistributedModel( +dm = DistributedIncompressibleModel( architecture = arch, grid = full_grid, timestepper = :RungeKutta3, diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl index 73b9704054..a7965b3bca 100644 --- a/src/Distributed/Distributed.jl +++ b/src/Distributed/Distributed.jl @@ -5,7 +5,7 @@ export HaloCommunication, HaloCommunicationBC, inject_halo_communication_boundary_conditions, DistributedFFTBasedPoissonSolver, - DistributedModel + DistributedIncompressibleModel include("distributed_utils.jl") include("multi_architectures.jl") @@ -13,6 +13,6 @@ include("halo_communication_bcs.jl") include("halo_communication.jl") include("distributed_fft_based_poisson_solver.jl") include("distributed_solve_for_pressure.jl") -include("distributed_model.jl") 
+include("distributed_incompressible_model.jl") end # module diff --git a/src/Distributed/distributed_model.jl b/src/Distributed/distributed_incompressible_model.jl similarity index 90% rename from src/Distributed/distributed_model.jl rename to src/Distributed/distributed_incompressible_model.jl index e5d263275d..aaff1788ed 100644 --- a/src/Distributed/distributed_model.jl +++ b/src/Distributed/distributed_incompressible_model.jl @@ -8,13 +8,13 @@ using Oceananigans.Grids: halo_size ##### Distributed model struct and constructor ##### -struct DistributedModel{A, G, M} +struct DistributedIncompressibleModel{A, G, M} architecture :: A grid :: G model :: M end -function DistributedModel(; architecture, grid, boundary_conditions=nothing, model_kwargs...) +function DistributedIncompressibleModel(; architecture, grid, boundary_conditions=nothing, model_kwargs...) my_rank = architecture.my_rank i, j, k = architecture.my_index Rx, Ry, Rz = architecture.ranks @@ -88,10 +88,10 @@ function DistributedModel(; architecture, grid, boundary_conditions=nothing, mod model_kwargs... ) - return DistributedModel(architecture, grid, my_model) + return DistributedIncompressibleModel(architecture, grid, my_model) end -function Base.show(io::IO, dm::DistributedModel) - print(io, "DistributedModel with ") +function Base.show(io::IO, dm::DistributedIncompressibleModel) + print(io, "DistributedIncompressibleModel with ") print(io, dm.architecture) end diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 8407bf76e2..318bedbf9b 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -191,7 +191,7 @@ function run_triply_periodic_local_grid_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = dm.model.grid @@ -211,7 +211,7 @@ function run_triply_periodic_local_grid_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = dm.model.grid @@ -231,7 +231,7 @@ function run_triply_periodic_local_grid_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = dm.model.grid @@ -251,7 +251,7 @@ function run_triply_periodic_local_grid_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, 
grid=full_grid, pressure_solver=nothing) i, j, k = arch.my_index local_grid = dm.model.grid @@ -275,7 +275,7 @@ function run_triply_periodic_bc_injection_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -292,7 +292,7 @@ function run_triply_periodic_bc_injection_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -309,7 +309,7 @@ function run_triply_periodic_bc_injection_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -326,7 +326,7 @@ function run_triply_periodic_bc_injection_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) fbcs = field.boundary_conditions @@ -347,7 +347,7 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 6, 4), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) interior(field) .= arch.my_rank @@ -370,7 +370,7 @@ function run_triply_periodic_halo_communication_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(3, 8, 2), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) interior(field) .= arch.my_rank @@ -393,7 +393,7 @@ function run_triply_periodic_halo_communication_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(3, 5, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) 
for field in fields(dm.model) interior(field) .= arch.my_rank @@ -416,7 +416,7 @@ function run_triply_periodic_halo_communication_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in fields(dm.model) interior(field) .= arch.my_rank @@ -480,16 +480,16 @@ end topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) model = dm.model time_step!(model, 1) - @test dm isa DistributedModel + @test dm isa DistributedIncompressibleModel @test model.clock.time == 1 simulation = Simulation(model, Δt=1, stop_iteration=2) run!(simulation) - @test dm isa DistributedModel + @test dm isa DistributedIncompressibleModel @test model.clock.time == 2 end end diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl index 305b144ce4..c730d72895 100644 --- a/test/test_distributed_poisson_solvers.jl +++ b/test/test_distributed_poisson_solvers.jl @@ -39,7 +39,7 @@ function divergence_free_poisson_solution_triply_periodic(grid_points, ranks) topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=grid_points, extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=ranks) - dm = DistributedModel(architecture=arch, grid=full_grid) + dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid) local_grid = dm.model.grid solver = DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) From c1e4c2c556dbbf91f1edd7cc341b6b78af5faa6f Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 15:01:17 -0500 Subject: [PATCH 078/100] Add new Buildkite job for MPI --- .buildkite/pipeline.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8ca97b7cc5..b21bedc368 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -240,6 +240,23 @@ steps: architecture: CPU depends_on: "init_cpu" +##### +##### Distributed/MPI +##### + + - label: "🐉 cpu distributed tests" + env: + JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER" + TEST_GROUP: "distributed" + CUDA_VISIBLE_DEVICES: "-1" + commands: + - "module load mpi/openmpi-x86_64" + - "mpiexec -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" + agents: + queue: Oceananigans + architecture: CPU + depends_on: "init_cpu" + ##### ##### Regression ##### From 655bb44d7c048cf82c6c741686830bee9cc90236 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 15:08:35 -0500 Subject: [PATCH 079/100] Resolve packages for Julia 1.5 --- Manifest.toml | 86 +++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 4d3e9d6205..d071cee7ef 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -12,9 +12,6 @@ git-tree-sha1 = "ffcfa2d345aaee0ef3d8346a073d5dd03c983ebe" uuid = 
"79e6a3ab-5dfb-504d-930d-738a2a938a0e" version = "3.2.0" -[[ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" - [[ArrayInterface]] deps = ["IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"] git-tree-sha1 = "e7edcc1ac140cce87b7442ff0fa88b5f19fb71fa" @@ -22,7 +19,10 @@ uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" version = "3.1.3" [[Artifacts]] +deps = ["Pkg"] +git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744" uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +version = "1.3.0" [[BFloat16s]] deps = ["LinearAlgebra", "Test"] @@ -45,10 +45,10 @@ uuid = "179af706-886a-5703-950a-314cd64e0468" version = "0.1.1" [[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "NNlib", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] -git-tree-sha1 = "2d90e6c29706856928f02e11ae15e71889905e34" +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] +git-tree-sha1 = "6ccc73b2d8b671f7a65c92b5f08f81422ebb7547" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "2.6.1" +version = "2.4.1" [[Cassette]] git-tree-sha1 = "9cc225870ec32ce7b9c773d4dcdaef32f622cf89" @@ -74,8 +74,10 @@ uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" version = "3.25.0" [[CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8e695f735fca77e9708e795eda62afdb869cbb70" uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.3.4+0" [[Crayons]] git-tree-sha1 = "3f71217b538d7aaee0b69ab47d9b7724ca8afa0d" @@ -116,10 +118,6 @@ git-tree-sha1 = "50ddf44c53698f5e784bbebb3f4b21c5807401b1" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" version = "0.8.3" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" - [[ExprTools]] git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" @@ -144,10 +142,10 @@ uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "6.2.0" [[GPUCompiler]] -deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "ef2839b063e158672583b9c09d2cf4876a8d3d55" +deps = ["DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "c853c810b52a80f9aad79ab109207889e57f41ef" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.10.0" +version = "0.8.3" [[Glob]] git-tree-sha1 = "4df9f7e06108728ebf00a0a11edee4b29a482bb2" @@ -210,24 +208,26 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "3.6.0" [[LazyArtifacts]] -deps = ["Artifacts", "Pkg"] +deps = ["Pkg"] +git-tree-sha1 = "4bb5499a1fc437342ea9ab7e319ede5a457c0968" uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - -[[LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "1.3.0" [[LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +deps = ["LibSSH2_jll", "Libdl", "MbedTLS_jll", "Pkg", "Zlib_jll", "nghttp2_jll"] 
+git-tree-sha1 = "897d962c20031e6012bba7b3dcb7a667170dad17" uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.70.0+2" [[LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +deps = ["Libdl", "MbedTLS_jll", "Pkg"] +git-tree-sha1 = "717705533148132e5466f2924b9a3657b16158e8" uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.9.0+3" [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -268,14 +268,10 @@ deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6" uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - -[[Memoize]] -deps = ["MacroTools"] -git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" -uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" -version = "0.4.4" +version = "2.16.8+1" [[MicrosoftMPI_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -286,9 +282,6 @@ version = "10.1.3+0" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" -[[MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" - [[NCDatasets]] deps = ["CFTime", "DataStructures", "Dates", "NetCDF_jll", "Printf"] git-tree-sha1 = "b71d83c87d80f5c54c55a7a9a3aa42bf931c72aa" @@ -303,12 +296,9 @@ version = "0.7.14" [[NetCDF_jll]] deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Pkg", "Zlib_jll", "nghttp2_jll"] -git-tree-sha1 = "0cf4d1bf2ef45156aed85c9ac5f8c7e697d9288c" +git-tree-sha1 = "d5835f95aea3b93965a1a7c06de9aace8cb82d99" uuid = "7243133f-43d8-5620-bbf4-c2c921802cf3" -version = "400.702.400+0" - -[[NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "400.701.400+0" [[OffsetArrays]] deps = ["Adapt"] @@ -358,7 +348,7 @@ uuid = "4a48f351-57a6-4416-9ec4-c37015456aae" version = "0.12.1" [[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -366,7 +356,7 @@ deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" [[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +deps = ["InteractiveUtils", "Markdown", "Sockets"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] @@ -458,10 +448,6 @@ git-tree-sha1 = "d7f4287dbc1e590265f50ceda1b40ed2bb31bbbb" uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" version = "1.4.0" -[[TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" - [[TableTraits]] deps = ["IteratorInterfaceExtensions"] git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" @@ -474,12 +460,8 @@ git-tree-sha1 = "a716dde43d57fa537a19058d044b495301ba6565" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" version = "1.3.2" -[[Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" - [[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] @@ -502,9 +484,13 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[Zlib_jll]] -deps = ["Libdl"] +deps = ["Artifacts", "JLLWrappers", 
"Libdl", "Pkg"] +git-tree-sha1 = "320228915c8debb12cb434c59057290f0834dbf6" uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+18" [[nghttp2_jll]] -deps = ["Artifacts", "Libdl"] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "8e2c44ab4d49ad9518f359ed8b62f83ba8beede4" uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.40.0+2" From 8869b3d00bad5c9eab423df9819b0c01251aa7b3 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 15:52:14 -0500 Subject: [PATCH 080/100] Fix dispatch for `fill_halo_regions!` --- src/Distributed/halo_communication.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index eba4fde6cd..b71693ef68 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -58,7 +58,7 @@ end ##### Filling halos for halo communication boundary conditions ##### -fill_halo_regions!(field::AbstractField{LX, LY, LZ}, arch, args...) where {LX, LY, LZ} = +fill_halo_regions!(field::AbstractField{LX, LY, LZ}, arch::AbstractMultiArchitecture, args...) where {LX, LY, LZ} = fill_halo_regions!(field.data, field.boundary_conditions, arch, field.grid, (LX, LY, LZ), args...) function fill_halo_regions!(c::AbstractArray, bcs, arch, grid, c_location, args...) From 56c9727f202716544e017071f12f54db30909490 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 16:14:37 -0500 Subject: [PATCH 081/100] Buildkite should have access to `mpiexec` now --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index b21bedc368..edbd31a078 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -250,6 +250,7 @@ steps: TEST_GROUP: "distributed" CUDA_VISIBLE_DEVICES: "-1" commands: + - "source /etc/bashrc" # Needed to get access to the module command. - "module load mpi/openmpi-x86_64" - "mpiexec -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: From a89d2485661c1b0a8f24bce6aae993b2a3057be4 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 16:23:23 -0500 Subject: [PATCH 082/100] Use system MPI on Buildkite/Tartarus --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index edbd31a078..115f84172a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -252,6 +252,7 @@ steps: commands: - "source /etc/bashrc" # Needed to get access to the module command. - "module load mpi/openmpi-x86_64" + - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -e 'ENV["JULIA_MPI_BINARY"]="system"; using Pkg; Pkg.add("MPI"); Pkg.build("MPI"; verbose=true)'" - "mpiexec -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: queue: Oceananigans From 4307d82def5c8d5e39084f70c3d34ef96ad790ce Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 16:26:36 -0500 Subject: [PATCH 083/100] Escape characters --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 115f84172a..4f4b80b0ec 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -252,7 +252,7 @@ steps: commands: - "source /etc/bashrc" # Needed to get access to the module command. 
- "module load mpi/openmpi-x86_64" - - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -e 'ENV["JULIA_MPI_BINARY"]="system"; using Pkg; Pkg.add("MPI"); Pkg.build("MPI"; verbose=true)'" + - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -e 'ENV[\"JULIA_MPI_BINARY\"]=\"system\"; using Pkg; Pkg.add(\"MPI\"); Pkg.build(\"MPI\"; verbose=true)'" - "mpiexec -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: queue: Oceananigans From fddca94003e3846cb9f33b3acb15616632989047 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 16:36:56 -0500 Subject: [PATCH 084/100] Fixing dispatch for `fill_halo_regions!`: part 2 --- src/Distributed/halo_communication.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index b71693ef68..d4515a7de0 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -61,7 +61,7 @@ end fill_halo_regions!(field::AbstractField{LX, LY, LZ}, arch::AbstractMultiArchitecture, args...) where {LX, LY, LZ} = fill_halo_regions!(field.data, field.boundary_conditions, arch, field.grid, (LX, LY, LZ), args...) -function fill_halo_regions!(c::AbstractArray, bcs, arch, grid, c_location, args...) +function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitecture, grid, c_location, args...) barrier = Event(device(child_architecture(arch))) From e240070b409411072616782f143add25c42a888e Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 17:06:48 -0500 Subject: [PATCH 085/100] Everyone gets system MPI --- .buildkite/pipeline.yml | 8 +++++--- test/test_distributed_models.jl | 34 ++++++++++++++++----------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 4f4b80b0ec..d8bcef8a99 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -47,6 +47,11 @@ steps: - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.precompile()'" - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.status()'" - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" + + # Use the system MPI + - "source /etc/bashrc" # Needed to get access to the module command. + - "module load mpi/openmpi-x86_64" + - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'ENV[\"JULIA_MPI_BINARY\"]=\"system\"; using Pkg; Pkg.add(\"MPI\"); Pkg.build(\"MPI\"; verbose=true)'" agents: queue: Oceananigans architecture: CPU @@ -250,9 +255,6 @@ steps: TEST_GROUP: "distributed" CUDA_VISIBLE_DEVICES: "-1" commands: - - "source /etc/bashrc" # Needed to get access to the module command. - - "module load mpi/openmpi-x86_64" - - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -e 'ENV[\"JULIA_MPI_BINARY\"]=\"system\"; using Pkg; Pkg.add(\"MPI\"); Pkg.build(\"MPI\"; verbose=true)'" - "mpiexec -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: queue: Oceananigans diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 318bedbf9b..edb22a7a59 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -458,7 +458,7 @@ end run_triply_periodic_local_grid_tests_with_221_ranks() end - # Test pressure bcs! + # TODO: Test pressure bcs! 
@testset "Injection of halo communication BCs" begin @info " Testing injection of halo communication BCs..." run_triply_periodic_bc_injection_tests_with_411_ranks() @@ -476,20 +476,20 @@ end # run_triply_periodic_halo_communication_tests_with_221_ranks() end - @testset "Time stepping" begin - topo = (Periodic, Periodic, Periodic) - full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - model = dm.model - - time_step!(model, 1) - @test dm isa DistributedIncompressibleModel - @test model.clock.time == 1 - - simulation = Simulation(model, Δt=1, stop_iteration=2) - run!(simulation) - @test dm isa DistributedIncompressibleModel - @test model.clock.time == 2 - end + # @testset "Time stepping" begin + # topo = (Periodic, Periodic, Periodic) + # full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + # arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + # dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + # model = dm.model + + # time_step!(model, 1) + # @test dm isa DistributedIncompressibleModel + # @test model.clock.time == 1 + + # simulation = Simulation(model, Δt=1, stop_iteration=2) + # run!(simulation) + # @test dm isa DistributedIncompressibleModel + # @test model.clock.time == 2 + # end end From 56695d77bf73a2d5d1def1556054e987a06bc5c9 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 17:42:48 -0500 Subject: [PATCH 086/100] Use MPI.jl's `mpiexecjl` --- .buildkite/pipeline.yml | 8 +++----- src/Distributed/distributed_incompressible_model.jl | 4 +--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d8bcef8a99..898d8aabc8 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -48,10 +48,8 @@ steps: - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.status()'" - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" - # Use the system MPI - - "source /etc/bashrc" # Needed to get access to the module command. 
- - "module load mpi/openmpi-x86_64" - - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'ENV[\"JULIA_MPI_BINARY\"]=\"system\"; using Pkg; Pkg.add(\"MPI\"); Pkg.build(\"MPI\"; verbose=true)'" + # Set up the mpiexecjl command + - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using MPI; MPI.install_mpiexecjl()'" agents: queue: Oceananigans architecture: CPU @@ -255,7 +253,7 @@ steps: TEST_GROUP: "distributed" CUDA_VISIBLE_DEVICES: "-1" commands: - - "mpiexec -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" + - "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER/bin/mpiexecjl -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: queue: Oceananigans architecture: CPU diff --git a/src/Distributed/distributed_incompressible_model.jl b/src/Distributed/distributed_incompressible_model.jl index aaff1788ed..5add326f64 100644 --- a/src/Distributed/distributed_incompressible_model.jl +++ b/src/Distributed/distributed_incompressible_model.jl @@ -20,11 +20,9 @@ function DistributedIncompressibleModel(; architecture, grid, boundary_condition Rx, Ry, Rz = architecture.ranks my_connectivity = architecture.connectivity - ## Construct local grid - Nx, Ny, Nz = size(grid) - # Pull out left and right endpoints for full model. + # Pull out endpoints for full model. xL, xR = grid.xF[1], grid.xF[Nx+1] yL, yR = grid.yF[1], grid.yF[Ny+1] zL, zR = grid.zF[1], grid.zF[Nz+1] From 922f155b2c0c13db50943e0de6d6d35b53a60295 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 17:56:36 -0500 Subject: [PATCH 087/100] Need julia binary in `$PATH` for mpiexecjl to work --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 898d8aabc8..dcbdd4c7fd 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -253,6 +253,7 @@ steps: TEST_GROUP: "distributed" CUDA_VISIBLE_DEVICES: "-1" commands: + - "PATH=$PATH:$TARTARUS_HOME/julia-$JULIA_VERSION/bin" # Need julia binary in $PATH for mpiexecjl to work. - "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER/bin/mpiexecjl -np 4 $TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: queue: Oceananigans From 0d59c69e078582538c13e2fd32c96cc1d85c4f26 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 20:38:29 -0500 Subject: [PATCH 088/100] Better design for multi-architectures and distributed models --- src/Architectures.jl | 25 +++- src/Distributed/Distributed.jl | 2 + .../distributed_incompressible_model.jl | 28 ++--- .../distributed_solve_for_pressure.jl | 25 +--- src/Distributed/multi_architectures.jl | 119 ++++++++++-------- src/Fields/new_data.jl | 4 +- .../show_incompressible_model.jl | 4 +- src/Oceananigans.jl | 2 +- src/Simulations/simulation.jl | 2 +- test/test_distributed_models.jl | 39 +++--- 10 files changed, 125 insertions(+), 125 deletions(-) diff --git a/src/Architectures.jl b/src/Architectures.jl index fbe36ae084..86c08ab1a5 100644 --- a/src/Architectures.jl +++ b/src/Architectures.jl @@ -2,7 +2,7 @@ module Architectures export @hascuda, - AbstractArchitecture, CPU, GPU, + AbstractArchitecture, AbstractCPUArchitecture, AbstractGPUArchitecture, CPU, GPU, device, architecture, array_type, arch_array using CUDA @@ -16,20 +16,35 @@ Abstract supertype for architectures supported by Oceananigans. 
""" abstract type AbstractArchitecture end + +""" + AbstractCPUArchitecture + +Abstract supertype for CPU architectures supported by Oceananigans. +""" +abstract type AbstractCPUArchitecture <: AbstractArchitecture end + +""" + AbstractGPUArchitecture + +Abstract supertype for GPU architectures supported by Oceananigans. +""" +abstract type AbstractGPUArchitecture <: AbstractArchitecture end + """ CPU <: AbstractArchitecture Run Oceananigans on one CPU node. Uses multiple threads if the environment variable `JULIA_NUM_THREADS` is set. """ -struct CPU <: AbstractArchitecture end +struct CPU <: AbstractCPUArchitecture end """ GPU <: AbstractArchitecture Run Oceananigans on a single NVIDIA CUDA GPU. """ -struct GPU <: AbstractArchitecture end +struct GPU <: AbstractGPUArchitecture end """ @hascuda expr @@ -41,8 +56,8 @@ macro hascuda(expr) return has_cuda() ? :($(esc(expr))) : :(nothing) end -device(::CPU) = KernelAbstractions.CPU() -device(::GPU) = KernelAbstractions.CUDADevice() +device(::AbstractCPUArchitecture) = KernelAbstractions.CPU() +device(::AbstractGPUArchitecture) = KernelAbstractions.CUDADevice() architecture(::Number) = nothing architecture(::Array) = CPU() diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl index a7965b3bca..4fe2bd0389 100644 --- a/src/Distributed/Distributed.jl +++ b/src/Distributed/Distributed.jl @@ -7,6 +7,8 @@ export DistributedFFTBasedPoissonSolver, DistributedIncompressibleModel +using Oceananigans.Utils + include("distributed_utils.jl") include("multi_architectures.jl") include("halo_communication_bcs.jl") diff --git a/src/Distributed/distributed_incompressible_model.jl b/src/Distributed/distributed_incompressible_model.jl index 5add326f64..b4ae45e78d 100644 --- a/src/Distributed/distributed_incompressible_model.jl +++ b/src/Distributed/distributed_incompressible_model.jl @@ -5,15 +5,9 @@ using Oceananigans.Grids using Oceananigans.Grids: halo_size ##### -##### Distributed model struct and constructor +##### Distributed incompressible model constructor ##### -struct DistributedIncompressibleModel{A, G, M} - architecture :: A - grid :: G - model :: M -end - function DistributedIncompressibleModel(; architecture, grid, boundary_conditions=nothing, model_kwargs...) my_rank = architecture.my_rank i, j, k = architecture.my_index @@ -21,14 +15,15 @@ function DistributedIncompressibleModel(; architecture, grid, boundary_condition my_connectivity = architecture.connectivity Nx, Ny, Nz = size(grid) + Lx, Ly, Lz = length(grid) - # Pull out endpoints for full model. + # Pull out endpoints for full grid. xL, xR = grid.xF[1], grid.xF[Nx+1] yL, yR = grid.yF[1], grid.yF[Ny+1] zL, zR = grid.zF[1], grid.zF[Nz+1] - Lx, Ly, Lz = length(grid) # Make sure we can put an integer number of grid points in each rank. + # Will generalize in the future. 
@assert isinteger(Nx / Rx) @assert isinteger(Ny / Ry) @assert isinteger(Nz / Rz) @@ -73,23 +68,18 @@ function DistributedIncompressibleModel(; architecture, grid, boundary_condition p_bcs = PressureBoundaryConditions(my_grid) p_bcs = inject_halo_communication_boundary_conditions(p_bcs, my_rank, my_connectivity) - pHY′ = CenterField(child_architecture(architecture), my_grid, p_bcs) - pNHS = CenterField(child_architecture(architecture), my_grid, p_bcs) + pHY′ = CenterField(architecture, my_grid, p_bcs) + pNHS = CenterField(architecture, my_grid, p_bcs) pressures = (pHY′=pHY′, pNHS=pNHS) my_model = IncompressibleModel(; - architecture = child_architecture(architecture), + architecture = architecture, grid = my_grid, boundary_conditions = communicative_bcs, pressure_solver = pressure_solver, pressures = pressures, - model_kwargs... + model_kwargs... ) - return DistributedIncompressibleModel(architecture, grid, my_model) -end - -function Base.show(io::IO, dm::DistributedIncompressibleModel) - print(io, "DistributedIncompressibleModel with ") - print(io, dm.architecture) + return my_model end diff --git a/src/Distributed/distributed_solve_for_pressure.jl b/src/Distributed/distributed_solve_for_pressure.jl index 47f56cef49..5e05e642b3 100644 --- a/src/Distributed/distributed_solve_for_pressure.jl +++ b/src/Distributed/distributed_solve_for_pressure.jl @@ -1,24 +1,9 @@ -import Oceananigans.Solvers: solve_for_pressure! +using Oceananigans.Solvers: calculate_pressure_source_term_fft_based_solver! -function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver, arch, grid, Δt, U★) +import Oceananigans.Solvers: solve_for_pressure!, source_term_storage, source_term_kernel, solution_storage - RHS = first(solver.storage) +source_term_storage(solver::DistributedFFTBasedPoissonSolver) = first(solver.storage) - rhs_event = launch!(arch, grid, :xyz, - calculate_pressure_right_hand_side!, RHS, arch, grid, Δt, U★, - dependencies = Event(device(arch))) +source_term_kernel(::DistributedFFTBasedPoissonSolver) = calculate_pressure_source_term_fft_based_solver! - wait(device(arch), rhs_event) - - solve_poisson_equation!(solver) - - ϕ = first(solver.storage) - - copy_event = launch!(arch, grid, :xyz, - copy_pressure!, pressure, ϕ, arch, grid, - dependencies = Event(device(arch))) - - wait(device(arch), copy_event) - - return nothing -end +solution_storage(solver::DistributedFFTBasedPoissonSolver) = first(solver.storage) diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl index 4e71da67da..8cd6f8c60e 100644 --- a/src/Distributed/multi_architectures.jl +++ b/src/Distributed/multi_architectures.jl @@ -2,28 +2,37 @@ using Oceananigans.Architectures using Oceananigans.Grids: topology, validate_tupled_argument -# TODO: Put connectivity inside architecture? MPI should be initialize so you can construct it in there. 
-# Might have to make it MultiCPU(; grid, ranks) - -abstract type AbstractMultiArchitecture <: AbstractArchitecture end +struct MultiCPU{G, R, I, ρ, C} <: AbstractCPUArchitecture + full_grid :: G + my_rank :: R + my_index :: I + ranks :: ρ + connectivity :: C +end -struct MultiCPU{R, I, ρ, C} <: AbstractMultiArchitecture +struct MultiGPU{G, R, I, ρ, C} <: AbstractGPUArchitecture + full_grid :: G my_rank :: R my_index :: I ranks :: ρ connectivity :: C end +const AbstractMultiArchitecture = Union{MultiCPU, MultiGPU} + child_architecture(::MultiCPU) = CPU() child_architecture(::CPU) = CPU() +child_architecture(::MultiGPU) = GPU() +child_architecture(::GPU) = GPU() + ##### ##### Converting between index and MPI rank taking k as the fast index ##### -@inline index2rank(i, j, k, Rx, Ry, Rz) = (i-1)*Ry*Rz + (j-1)*Rz + (k-1) +index2rank(i, j, k, Rx, Ry, Rz) = (i-1)*Ry*Rz + (j-1)*Rz + (k-1) -@inline function rank2index(r, Rx, Ry, Rz) +function rank2index(r, Rx, Ry, Rz) i = div(r, Ry*Rz) r -= i*Ry*Rz j = div(r, Rz) @@ -36,64 +45,64 @@ end ##### struct RankConnectivity{E, W, N, S, T, B} - east :: E - west :: W - north :: N - south :: S - top :: T - bottom :: B + east :: E + west :: W + north :: N + south :: S + top :: T + bottom :: B end RankConnectivity(; east, west, north, south, top, bottom) = - RankConnectivity(east, west, north, south, top, bottom) + RankConnectivity(east, west, north, south, top, bottom) function increment_index(i, R, topo) - R == 1 && return nothing - if i+1 > R - if topo == Periodic - return 1 - else - return nothing - end - else - return i+1 - end + R == 1 && return nothing + if i+1 > R + if topo == Periodic + return 1 + else + return nothing + end + else + return i+1 + end end function decrement_index(i, R, topo) - R == 1 && return nothing - if i-1 < 1 - if topo == Periodic - return R - else - return nothing - end - else - return i-1 - end + R == 1 && return nothing + if i-1 < 1 + if topo == Periodic + return R + else + return nothing + end + else + return i-1 + end end function RankConnectivity(model_index, ranks, topology) - i, j, k = model_index - Rx, Ry, Rz = ranks - TX, TY, TZ = topology - - i_east = increment_index(i, Rx, TX) - i_west = decrement_index(i, Rx, TX) - j_north = increment_index(j, Ry, TY) - j_south = decrement_index(j, Ry, TY) - k_top = increment_index(k, Rz, TZ) - k_bot = decrement_index(k, Rz, TZ) - - r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) - r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) - r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) - r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz) - r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) - r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) - - return RankConnectivity(east=r_east, west=r_west, north=r_north, - south=r_south, top=r_top, bottom=r_bot) + i, j, k = model_index + Rx, Ry, Rz = ranks + TX, TY, TZ = topology + + i_east = increment_index(i, Rx, TX) + i_west = decrement_index(i, Rx, TX) + j_north = increment_index(j, Ry, TY) + j_south = decrement_index(j, Ry, TY) + k_top = increment_index(k, Rz, TZ) + k_bot = decrement_index(k, Rz, TZ) + + r_east = isnothing(i_east) ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz) + r_west = isnothing(i_west) ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz) + r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz) + r_south = isnothing(j_south) ? 
nothing : index2rank(i, j_south, k, Rx, Ry, Rz) + r_top = isnothing(k_top) ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz) + r_bot = isnothing(k_bot) ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz) + + return RankConnectivity(east=r_east, west=r_west, north=r_north, + south=r_south, top=r_top, bottom=r_bot) end ##### @@ -122,7 +131,7 @@ function MultiCPU(; grid, ranks) my_connectivity = RankConnectivity(my_index, ranks, topology(grid)) - return MultiCPU(my_rank, my_index, ranks, my_connectivity) + return MultiCPU(grid, my_rank, my_index, ranks, my_connectivity) end ##### diff --git a/src/Fields/new_data.jl b/src/Fields/new_data.jl index 9297009819..d1da2bb5c2 100644 --- a/src/Fields/new_data.jl +++ b/src/Fields/new_data.jl @@ -45,7 +45,7 @@ Returns an `OffsetArray` of zeros of float type `FT`, with parent data in CPU memory and indices corresponding to a field on a `grid` of `size(grid)` and located at `loc`. """ -function new_data(FT, ::CPU, grid, loc) +function new_data(FT, ::AbstractCPUArchitecture, grid, loc) underlying_data = zeros(FT, total_length(loc[1], topology(grid, 1), grid.Nx, grid.Hx), total_length(loc[2], topology(grid, 2), grid.Ny, grid.Hy), total_length(loc[3], topology(grid, 3), grid.Nz, grid.Hz)) @@ -60,7 +60,7 @@ Returns an `OffsetArray` of zeros of float type `FT`, with parent data in GPU memory and indices corresponding to a field on a `grid` of `size(grid)` and located at `loc`. """ -function new_data(FT, ::GPU, grid, loc) +function new_data(FT, ::AbstractGPUArchitecture, grid, loc) underlying_data = CuArray{FT}(undef, total_length(loc[1], topology(grid, 1), grid.Nx, grid.Hx), total_length(loc[2], topology(grid, 2), grid.Ny, grid.Hy), total_length(loc[3], topology(grid, 3), grid.Nz, grid.Hz)) diff --git a/src/Models/IncompressibleModels/show_incompressible_model.jl b/src/Models/IncompressibleModels/show_incompressible_model.jl index 19e1229307..96fe62a996 100644 --- a/src/Models/IncompressibleModels/show_incompressible_model.jl +++ b/src/Models/IncompressibleModels/show_incompressible_model.jl @@ -1,9 +1,9 @@ -using Oceananigans.Utils: prettytime, ordered_dict_show using Oceananigans: short_show +using Oceananigans.Utils: prettytime, ordered_dict_show """Show the innards of a `Model` in the REPL.""" function Base.show(io::IO, model::IncompressibleModel{TS, C, A}) where {TS, C, A} - print(io, "IncompressibleModel{$A, $(eltype(model.grid))}", + print(io, "IncompressibleModel{$(Base.typename(A)), $(eltype(model.grid))}", "(time = $(prettytime(model.clock.time)), iteration = $(model.clock.iteration)) \n", "├── grid: $(short_show(model.grid))\n", "├── tracers: $(tracernames(model.tracers))\n", diff --git a/src/Oceananigans.jl b/src/Oceananigans.jl index 554409df70..a2569b40ee 100644 --- a/src/Oceananigans.jl +++ b/src/Oceananigans.jl @@ -76,7 +76,7 @@ export ∂x, ∂y, ∂z, @at, # Distributed - MultiCPU, + MultiCPU, DistributedIncompressibleModel, # Utils prettytime diff --git a/src/Simulations/simulation.jl b/src/Simulations/simulation.jl index 2529cc54e1..a2dae097c2 100644 --- a/src/Simulations/simulation.jl +++ b/src/Simulations/simulation.jl @@ -83,7 +83,7 @@ function Simulation(model; Δt, end Base.show(io::IO, s::Simulation) = - print(io, "Simulation{$(typeof(s.model).name){$(typeof(s.model.architecture)), $(eltype(s.model.grid))}}\n", + print(io, "Simulation{$(typeof(s.model).name){$(Base.typename(typeof(s.model.architecture))), $(eltype(s.model.grid))}}\n", "├── Model clock: time = $(prettytime(s.model.clock.time)), iteration = $(s.model.clock.iteration) \n", "├── Next 
time step ($(typeof(s.Δt))): $(prettytime(get_Δt(s.Δt))) \n", "├── Iteration interval: $(s.iteration_interval)\n", diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index edb22a7a59..f75a6511d1 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -66,13 +66,13 @@ function run_triply_periodic_rank_connectivity_tests_with_141_ranks() @test isnothing(connectivity.bottom) # +---+ - # | 0 | - # +---+ - # | 1 | + # | 3 | # +---+ # | 2 | # +---+ - # | 3 | + # | 1 | + # +---+ + # | 0 | # +---+ if my_rank == 0 @@ -476,20 +476,19 @@ end # run_triply_periodic_halo_communication_tests_with_221_ranks() end - # @testset "Time stepping" begin - # topo = (Periodic, Periodic, Periodic) - # full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) - # arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - # dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - # model = dm.model - - # time_step!(model, 1) - # @test dm isa DistributedIncompressibleModel - # @test model.clock.time == 1 - - # simulation = Simulation(model, Δt=1, stop_iteration=2) - # run!(simulation) - # @test dm isa DistributedIncompressibleModel - # @test model.clock.time == 2 - # end + @testset "Time stepping" begin + topo = (Periodic, Periodic, Periodic) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) + arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid) + + time_step!(model, 1) + @test model isa IncompressibleModel + @test model.clock.time == 1 + + simulation = Simulation(model, Δt=1, stop_iteration=2) + run!(simulation) + @test model isa IncompressibleModel + @test model.clock.time == 2 + end end From da22b16addf598aede0686ed6db5df16dc6f9236 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 22:01:14 -0500 Subject: [PATCH 089/100] Update tests and Buildkite zoo --- .buildkite/pipeline.yml | 10 ++--- src/Architectures.jl | 8 ++-- test/test_distributed_models.jl | 48 ++++++++++++------------ test/test_distributed_poisson_solvers.jl | 4 +- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index dcbdd4c7fd..a185c19a8e 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -166,7 +166,7 @@ steps: ##### HydrostaticFreeSurfaceModel ##### - - label: "💧 gpu hydrostatic free surface model tests" + - label: "🐡 gpu hydrostatic free surface model tests" env: JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER" TEST_GROUP: "hydrostatic_free_surface" @@ -177,7 +177,7 @@ steps: architecture: GPU depends_on: "init_gpu" - - label: "💦 cpu hydrostatic free surface model tests" + - label: "🐠 cpu hydrostatic free surface model tests" env: JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER" TEST_GROUP: "hydrostatic_free_surface" @@ -193,7 +193,7 @@ steps: ##### ShallowWaterModel ##### - - label: "💧 gpu shallow water model tests" + - label: "🦑 gpu shallow water model tests" env: JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER" TEST_GROUP: "shallow_water" @@ -204,7 +204,7 @@ steps: architecture: GPU depends_on: "init_gpu" - - label: "💦 cpu shallow water model tests" + - label: "🦐 cpu shallow water model tests" env: JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER" TEST_GROUP: "shallow_water" @@ -337,7 +337,7 @@ steps: ##### Clean up ##### - - label: "🧻 clean up 
gpu environment" + - label: "🧽 clean up gpu environment" command: "rm -rf $SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER" agents: queue: Oceananigans diff --git a/src/Architectures.jl b/src/Architectures.jl index 86c08ab1a5..b4b2293af1 100644 --- a/src/Architectures.jl +++ b/src/Architectures.jl @@ -66,9 +66,9 @@ architecture(::CuArray) = GPU() array_type(::CPU) = Array array_type(::GPU) = CuArray -arch_array(::CPU, A::Array) = A -arch_array(::CPU, A::CuArray) = Array(A) -arch_array(::GPU, A::Array) = CuArray(A) -arch_array(::GPU, A::CuArray) = A +arch_array(::AbstractCPUArchitecture, A::Array) = A +arch_array(::AbstractCPUArchitecture, A::CuArray) = Array(A) +arch_array(::AbstractGPUArchitecture, A::Array) = CuArray(A) +arch_array(::AbstractGPUArchitecture, A::CuArray) = A end diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index f75a6511d1..52703c750f 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -191,10 +191,10 @@ function run_triply_periodic_local_grid_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - local_grid = dm.model.grid + local_grid = model.grid nx, ny, nz = size(local_grid) @test local_grid.xF[1] == 0.25*my_rank @@ -211,10 +211,10 @@ function run_triply_periodic_local_grid_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - local_grid = dm.model.grid + local_grid = model.grid nx, ny, nz = size(local_grid) @test local_grid.xF[1] == 0 @@ -231,10 +231,10 @@ function run_triply_periodic_local_grid_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - local_grid = dm.model.grid + local_grid = model.grid nx, ny, nz = size(local_grid) @test local_grid.xF[1] == 0 @@ -251,10 +251,10 @@ function run_triply_periodic_local_grid_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) i, j, k = arch.my_index - local_grid = dm.model.grid + local_grid = model.grid nx, ny, nz = size(local_grid) @test local_grid.xF[1] == 0.5*(i-1) @@ -275,9 +275,9 @@ function run_triply_periodic_bc_injection_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = 
RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) fbcs = field.boundary_conditions @test fbcs.east isa HaloCommunicationBC @test fbcs.west isa HaloCommunicationBC @@ -292,9 +292,9 @@ function run_triply_periodic_bc_injection_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) fbcs = field.boundary_conditions @test !isa(fbcs.east, HaloCommunicationBC) @test !isa(fbcs.west, HaloCommunicationBC) @@ -309,9 +309,9 @@ function run_triply_periodic_bc_injection_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) fbcs = field.boundary_conditions @test !isa(fbcs.east, HaloCommunicationBC) @test !isa(fbcs.west, HaloCommunicationBC) @@ -326,9 +326,9 @@ function run_triply_periodic_bc_injection_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) fbcs = field.boundary_conditions @test fbcs.east isa HaloCommunicationBC @test fbcs.west isa HaloCommunicationBC @@ -347,9 +347,9 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 6, 4), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @@ -370,9 +370,9 @@ function run_triply_periodic_halo_communication_tests_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(3, 8, 2), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) interior(field) .= arch.my_rank 
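        # Stamping each interior with its owner's rank number makes the expected
        # halo contents unambiguous: after the exchange below, each halo must
        # hold the neighboring rank's number, which is what the @test lines check.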
fill_halo_regions!(field, arch) @@ -393,9 +393,9 @@ function run_triply_periodic_halo_communication_tests_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(3, 5, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @@ -416,9 +416,9 @@ function run_triply_periodic_halo_communication_tests_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(dm.model) + for field in fields(model) interior(field) .= arch.my_rank fill_halo_regions!(field, arch) diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl index c730d72895..7726f5cd94 100644 --- a/test/test_distributed_poisson_solvers.jl +++ b/test/test_distributed_poisson_solvers.jl @@ -39,9 +39,9 @@ function divergence_free_poisson_solution_triply_periodic(grid_points, ranks) topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=grid_points, extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=ranks) - dm = DistributedIncompressibleModel(architecture=arch, grid=full_grid) + model = DistributedIncompressibleModel(architecture=arch, grid=full_grid) - local_grid = dm.model.grid + local_grid = model.grid solver = DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) R = random_divergent_source_term(Float64, child_architecture(arch), local_grid) From 4baba22dc675b69dc5945b5567aac288311e62e1 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 23:37:40 -0500 Subject: [PATCH 090/100] More tests --- .../distributed_incompressible_model.jl | 10 +- src/Oceananigans.jl | 3 - test/test_distributed_models.jl | 100 +++++++++--------- 3 files changed, 55 insertions(+), 58 deletions(-) diff --git a/src/Distributed/distributed_incompressible_model.jl b/src/Distributed/distributed_incompressible_model.jl index b4ae45e78d..72bd3e6c57 100644 --- a/src/Distributed/distributed_incompressible_model.jl +++ b/src/Distributed/distributed_incompressible_model.jl @@ -45,11 +45,11 @@ function DistributedIncompressibleModel(; architecture, grid, boundary_condition bcs = isnothing(boundary_conditions) ? NamedTuple() : boundary_conditions bcs = ( - u = haskey(bcs, :u) ? bcs.u : UVelocityBoundaryConditions(grid), - v = haskey(bcs, :v) ? bcs.v : VVelocityBoundaryConditions(grid), - w = haskey(bcs, :w) ? bcs.w : WVelocityBoundaryConditions(grid), - T = haskey(bcs, :T) ? bcs.T : TracerBoundaryConditions(grid), - S = haskey(bcs, :S) ? bcs.S : TracerBoundaryConditions(grid) + u = haskey(bcs, :u) ? bcs.u : UVelocityBoundaryConditions(my_grid), + v = haskey(bcs, :v) ? bcs.v : VVelocityBoundaryConditions(my_grid), + w = haskey(bcs, :w) ? bcs.w : WVelocityBoundaryConditions(my_grid), + T = haskey(bcs, :T) ? bcs.T : TracerBoundaryConditions(my_grid), + S = haskey(bcs, :S) ? 
bcs.S : TracerBoundaryConditions(my_grid) ) communicative_bcs = ( diff --git a/src/Oceananigans.jl b/src/Oceananigans.jl index a2569b40ee..2434cd06d5 100644 --- a/src/Oceananigans.jl +++ b/src/Oceananigans.jl @@ -75,9 +75,6 @@ export # Abstract operations ∂x, ∂y, ∂z, @at, - # Distributed - MultiCPU, DistributedIncompressibleModel, - # Utils prettytime diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 52703c750f..81b31d5e34 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -12,7 +12,7 @@ mpi_ranks = MPI.Comm_size(comm) ##### Multi architectures and rank connectivity ##### -function run_triply_periodic_rank_connectivity_tests_with_411_ranks() +function test_triply_periodic_rank_connectivity_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) @@ -49,7 +49,7 @@ function run_triply_periodic_rank_connectivity_tests_with_411_ranks() return nothing end -function run_triply_periodic_rank_connectivity_tests_with_141_ranks() +function test_triply_periodic_rank_connectivity_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) @@ -92,7 +92,7 @@ function run_triply_periodic_rank_connectivity_tests_with_141_ranks() return nothing end -function run_triply_periodic_rank_connectivity_tests_with_114_ranks() +function test_triply_periodic_rank_connectivity_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) @@ -138,7 +138,7 @@ function run_triply_periodic_rank_connectivity_tests_with_114_ranks() return nothing end -function run_triply_periodic_rank_connectivity_tests_with_221_ranks() +function test_triply_periodic_rank_connectivity_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) @@ -187,7 +187,7 @@ end ##### Local grids for distributed models ##### -function run_triply_periodic_local_grid_tests_with_411_ranks() +function test_triply_periodic_local_grid_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) @@ -207,7 +207,7 @@ function run_triply_periodic_local_grid_tests_with_411_ranks() return nothing end -function run_triply_periodic_local_grid_tests_with_141_ranks() +function test_triply_periodic_local_grid_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) @@ -227,7 +227,7 @@ function run_triply_periodic_local_grid_tests_with_141_ranks() return nothing end -function run_triply_periodic_local_grid_tests_with_114_ranks() +function test_triply_periodic_local_grid_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) @@ -247,7 +247,7 @@ function run_triply_periodic_local_grid_tests_with_114_ranks() return nothing end -function run_triply_periodic_local_grid_tests_with_221_ranks() +function 
test_triply_periodic_local_grid_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) @@ -271,13 +271,13 @@ end ##### Injection of halo communication BCs ##### -function run_triply_periodic_bc_injection_tests_with_411_ranks() +function test_triply_periodic_bc_injection_with_411_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(model) + for field in merge(fields(model), model.pressures) fbcs = field.boundary_conditions @test fbcs.east isa HaloCommunicationBC @test fbcs.west isa HaloCommunicationBC @@ -288,13 +288,13 @@ function run_triply_periodic_bc_injection_tests_with_411_ranks() end end -function run_triply_periodic_bc_injection_tests_with_141_ranks() +function test_triply_periodic_bc_injection_with_141_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(model) + for field in merge(fields(model), model.pressures) fbcs = field.boundary_conditions @test !isa(fbcs.east, HaloCommunicationBC) @test !isa(fbcs.west, HaloCommunicationBC) @@ -305,13 +305,13 @@ function run_triply_periodic_bc_injection_tests_with_141_ranks() end end -function run_triply_periodic_bc_injection_tests_with_114_ranks() +function test_triply_periodic_bc_injection_with_114_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(model) + for field in merge(fields(model), model.pressures) fbcs = field.boundary_conditions @test !isa(fbcs.east, HaloCommunicationBC) @test !isa(fbcs.west, HaloCommunicationBC) @@ -322,13 +322,13 @@ function run_triply_periodic_bc_injection_tests_with_114_ranks() end end -function run_triply_periodic_bc_injection_tests_with_221_ranks() +function test_triply_periodic_bc_injection_with_221_ranks() topo = (Periodic, Periodic, Periodic) full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(model) + for field in merge(fields(model), model.pressures) fbcs = field.boundary_conditions @test fbcs.east isa HaloCommunicationBC @test fbcs.west isa HaloCommunicationBC @@ -343,13 +343,13 @@ end ##### Halo communication ##### -function run_triply_periodic_halo_communication_tests_with_411_ranks() +function test_triply_periodic_halo_communication_with_411_ranks(halo) topo = (Periodic, Periodic, Periodic) - full_grid = RegularRectilinearGrid(topology=topo, size=(8, 6, 4), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(16, 6, 4), extent=(1, 2, 3), halo=halo) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in 
fields(model) + for field in merge(fields(model), model.pressures) interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @@ -366,13 +366,13 @@ function run_triply_periodic_halo_communication_tests_with_411_ranks() return nothing end -function run_triply_periodic_halo_communication_tests_with_141_ranks() +function test_triply_periodic_halo_communication_with_141_ranks(halo) topo = (Periodic, Periodic, Periodic) - full_grid = RegularRectilinearGrid(topology=topo, size=(3, 8, 2), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(4, 16, 4), extent=(1, 2, 3), halo=halo) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(model) + for field in merge(fields(model), model.pressures) interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @@ -389,13 +389,13 @@ function run_triply_periodic_halo_communication_tests_with_141_ranks() return nothing end -function run_triply_periodic_halo_communication_tests_with_114_ranks() +function test_triply_periodic_halo_communication_with_114_ranks(halo) topo = (Periodic, Periodic, Periodic) - full_grid = RegularRectilinearGrid(topology=topo, size=(3, 5, 8), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(4, 4, 16), extent=(1, 2, 3), halo=halo) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(model) + for field in merge(fields(model), model.pressures) interior(field) .= arch.my_rank fill_halo_regions!(field, arch) @@ -412,20 +412,20 @@ function run_triply_periodic_halo_communication_tests_with_114_ranks() return nothing end -function run_triply_periodic_halo_communication_tests_with_221_ranks() +function test_triply_periodic_halo_communication_with_221_ranks(halo) topo = (Periodic, Periodic, Periodic) - full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3)) + full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 3), extent=(1, 2, 3), halo=halo) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - for field in fields(model) + for field in merge(fields(model), model.pressures) interior(field) .= arch.my_rank fill_halo_regions!(field, arch) - @test all(east_halo(field) .== arch.connectivity.east) - @test all(west_halo(field) .== arch.connectivity.west) - @test all(north_halo(field) .== arch.connectivity.north) - @test all(south_halo(field) .== arch.connectivity.south) + @test all(east_halo(field, include_corners=false) .== arch.connectivity.east) + @test all(west_halo(field, include_corners=false) .== arch.connectivity.west) + @test all(north_halo(field, include_corners=false) .== arch.connectivity.north) + @test all(south_halo(field, include_corners=false) .== arch.connectivity.south) @test all(interior(field) .== arch.my_rank) @test all(top_halo(field, include_corners=false) .== arch.my_rank) @@ -444,36 +444,36 @@ end @testset "Multi architectures rank connectivity" begin @info " Testing multi architecture rank connectivity..." 
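        # Concrete case: with ranks=(4, 1, 1) and a periodic x-topology the ranks
        # are laid out 0 | 1 | 2 | 3 along x and the ends wrap around, so rank 0
        # has west neighbor 3 and east neighbor 1, while rank 3 has west 2, east 0.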
- run_triply_periodic_rank_connectivity_tests_with_411_ranks() - run_triply_periodic_rank_connectivity_tests_with_141_ranks() - run_triply_periodic_rank_connectivity_tests_with_114_ranks() - run_triply_periodic_rank_connectivity_tests_with_221_ranks() + test_triply_periodic_rank_connectivity_with_411_ranks() + test_triply_periodic_rank_connectivity_with_141_ranks() + test_triply_periodic_rank_connectivity_with_114_ranks() + test_triply_periodic_rank_connectivity_with_221_ranks() end @testset "Local grids for distributed models" begin @info " Testing local grids for distributed models..." - run_triply_periodic_local_grid_tests_with_411_ranks() - run_triply_periodic_local_grid_tests_with_141_ranks() - run_triply_periodic_local_grid_tests_with_114_ranks() - run_triply_periodic_local_grid_tests_with_221_ranks() + test_triply_periodic_local_grid_with_411_ranks() + test_triply_periodic_local_grid_with_141_ranks() + test_triply_periodic_local_grid_with_114_ranks() + test_triply_periodic_local_grid_with_221_ranks() end - # TODO: Test pressure bcs! @testset "Injection of halo communication BCs" begin @info " Testing injection of halo communication BCs..." - run_triply_periodic_bc_injection_tests_with_411_ranks() - run_triply_periodic_bc_injection_tests_with_141_ranks() - run_triply_periodic_bc_injection_tests_with_114_ranks() - run_triply_periodic_bc_injection_tests_with_221_ranks() + test_triply_periodic_bc_injection_with_411_ranks() + test_triply_periodic_bc_injection_with_141_ranks() + test_triply_periodic_bc_injection_with_114_ranks() + test_triply_periodic_bc_injection_with_221_ranks() end - # TODO: Test larger halos! @testset "Halo communication" begin @info " Testing halo communication..." - run_triply_periodic_halo_communication_tests_with_411_ranks() - run_triply_periodic_halo_communication_tests_with_141_ranks() - run_triply_periodic_halo_communication_tests_with_114_ranks() - # run_triply_periodic_halo_communication_tests_with_221_ranks() + for H in 1:3 + test_triply_periodic_halo_communication_with_411_ranks((H, H, H)) + test_triply_periodic_halo_communication_with_141_ranks((H, H, H)) + test_triply_periodic_halo_communication_with_114_ranks((H, H, H)) + test_triply_periodic_halo_communication_with_221_ranks((H, H, H)) + end end @testset "Time stepping" begin From f7eb66413272af1faeccf709812fad1a3b76c913 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Thu, 4 Mar 2021 23:56:47 -0500 Subject: [PATCH 091/100] Address PR comments --- src/Distributed/halo_communication.jl | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index d4515a7de0..1a0d2bade4 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -10,12 +10,7 @@ import Oceananigans.BoundaryConditions: ##### sides = (:west, :east, :south, :north, :top, :bottom) - -side_id = Dict( - :east => 1, :west => 2, - :north => 3, :south => 4, - :top => 5, :bottom => 6 -) +side_id = Dict(side => n for (n, side) in enumerate(sides)) opposite_side = Dict( :east => :west, :west => :east, @@ -23,16 +18,14 @@ opposite_side = Dict( :top => :bottom, :bottom => :top ) -# Unfortunately can't call MPI.Comm_size(MPI.COMM_WORLD) before MPI.Init(). -MAX_RANKS = 10^3 -RANK_DIGITS = 3 - # Define functions that return unique send and recv MPI tags for each side. 
# It's an integer where # digit 1: the side # digits 2-4: the from rank # digits 5-7: the to rank +RANK_DIGITS = 3 + for side in sides side_str = string(side) send_tag_fn_name = Symbol("$(side)_send_tag") From faa07293d7f4e71c03064b30c573718c10a3b416 Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Fri, 5 Mar 2021 09:07:38 -0500 Subject: [PATCH 092/100] Bump v0.53.0 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 400c8bd74f..855bc1e7d2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Oceananigans" uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09" -version = "0.52.1" +version = "0.53.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From 42d2eab2a94bb17ab5b2e77fbc5ee128a79fc345 Mon Sep 17 00:00:00 2001 From: Ali Ramadhan Date: Tue, 9 Mar 2021 09:40:49 -0500 Subject: [PATCH 093/100] Update src/Distributed/halo_communication.jl --- src/Distributed/halo_communication.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index 1a0d2bade4..4a39a16d4b 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -21,8 +21,8 @@ opposite_side = Dict( # Define functions that return unique send and recv MPI tags for each side. # It's an integer where # digit 1: the side -# digits 2-4: the from rank -# digits 5-7: the to rank +# digits 2-4: the "from" rank +# digits 5-7: the "to" rank RANK_DIGITS = 3 From 3ae697f87420943b3bd0f9cff9c0d4150cce8eea Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Tue, 9 Mar 2021 21:02:38 -0500 Subject: [PATCH 094/100] Inject halo communication BCs in `Field` constructor --- src/Distributed/Distributed.jl | 1 + src/Distributed/distributed_fields.jl | 10 +++++ .../distributed_incompressible_model.jl | 43 +++---------------- src/Fields/field.jl | 16 +++---- 4 files changed, 25 insertions(+), 45 deletions(-) create mode 100644 src/Distributed/distributed_fields.jl diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl index 4fe2bd0389..7c1973c807 100644 --- a/src/Distributed/Distributed.jl +++ b/src/Distributed/Distributed.jl @@ -13,6 +13,7 @@ include("distributed_utils.jl") include("multi_architectures.jl") include("halo_communication_bcs.jl") include("halo_communication.jl") +include("distributed_fields.jl") include("distributed_fft_based_poisson_solver.jl") include("distributed_solve_for_pressure.jl") include("distributed_incompressible_model.jl") diff --git a/src/Distributed/distributed_fields.jl b/src/Distributed/distributed_fields.jl new file mode 100644 index 0000000000..f73878a7bc --- /dev/null +++ b/src/Distributed/distributed_fields.jl @@ -0,0 +1,10 @@ +import Oceananigans.Fields: Field + +function Field(X, Y, Z, arch::AbstractMultiArchitecture, grid, + bcs = FieldBoundaryConditions(grid, (X, Y, Z)), + data = new_data(eltype(grid), arch, grid, (X, Y, Z))) + + communicative_bcs = inject_halo_communication_boundary_conditions(bcs, arch.my_rank, arch.connectivity) + + return Field(X, Y, Z, child_architecture(arch), grid, communicative_bcs, data) +end diff --git a/src/Distributed/distributed_incompressible_model.jl b/src/Distributed/distributed_incompressible_model.jl index 72bd3e6c57..c167be41ce 100644 --- a/src/Distributed/distributed_incompressible_model.jl +++ b/src/Distributed/distributed_incompressible_model.jl @@ -8,7 +8,7 @@ using Oceananigans.Grids: halo_size ##### Distributed 
incompressible model constructor ##### -function DistributedIncompressibleModel(; architecture, grid, boundary_conditions=nothing, model_kwargs...) +function DistributedIncompressibleModel(; architecture, grid, model_kwargs...) my_rank = architecture.my_rank i, j, k = architecture.my_index Rx, Ry, Rz = architecture.ranks @@ -35,50 +35,19 @@ function DistributedIncompressibleModel(; architecture, grid, boundary_condition y₁, y₂ = yL + (j-1)*ly, yL + j*ly z₁, z₂ = zL + (k-1)*lz, zL + k*lz - # FIXME: local grid might have different topology! + # FIXME? local grid might have different topology! my_grid = RegularRectilinearGrid(topology=topology(grid), size=(nx, ny, nz), x=(x₁, x₂), y=(y₁, y₂), z=(z₁, z₂), halo=halo_size(grid)) - ## Change appropriate boundary conditions to halo communication BCs - - # FIXME: Stop assuming (u, v, w, T, S). - - bcs = isnothing(boundary_conditions) ? NamedTuple() : boundary_conditions - - bcs = ( - u = haskey(bcs, :u) ? bcs.u : UVelocityBoundaryConditions(my_grid), - v = haskey(bcs, :v) ? bcs.v : VVelocityBoundaryConditions(my_grid), - w = haskey(bcs, :w) ? bcs.w : WVelocityBoundaryConditions(my_grid), - T = haskey(bcs, :T) ? bcs.T : TracerBoundaryConditions(my_grid), - S = haskey(bcs, :S) ? bcs.S : TracerBoundaryConditions(my_grid) - ) - - communicative_bcs = ( - u = inject_halo_communication_boundary_conditions(bcs.u, my_rank, my_connectivity), - v = inject_halo_communication_boundary_conditions(bcs.v, my_rank, my_connectivity), - w = inject_halo_communication_boundary_conditions(bcs.w, my_rank, my_connectivity), - T = inject_halo_communication_boundary_conditions(bcs.T, my_rank, my_connectivity), - S = inject_halo_communication_boundary_conditions(bcs.S, my_rank, my_connectivity) - ) - ## Construct local model pressure_solver = haskey(model_kwargs, :pressure_solver) ? Dict(model_kwargs)[:pressure_solver] : DistributedFFTBasedPoissonSolver(architecture, grid, my_grid) - p_bcs = PressureBoundaryConditions(my_grid) - p_bcs = inject_halo_communication_boundary_conditions(p_bcs, my_rank, my_connectivity) - - pHY′ = CenterField(architecture, my_grid, p_bcs) - pNHS = CenterField(architecture, my_grid, p_bcs) - pressures = (pHY′=pHY′, pNHS=pNHS) - my_model = IncompressibleModel(; - architecture = architecture, - grid = my_grid, - boundary_conditions = communicative_bcs, - pressure_solver = pressure_solver, - pressures = pressures, - model_kwargs... + architecture = architecture, + grid = my_grid, + pressure_solver = pressure_solver, + model_kwargs... 
) return my_model diff --git a/src/Fields/field.jl b/src/Fields/field.jl index 6389e7a665..c350423415 100644 --- a/src/Fields/field.jl +++ b/src/Fields/field.jl @@ -78,7 +78,7 @@ function CenterField(FT::DataType, arch, grid, bcs = TracerBoundaryConditions(grid), data = new_data(FT, arch, grid, (Center, Center, Center))) - return Field{Center, Center, Center}(data, grid, bcs) + return Field(Center, Center, Center, arch, grid, bcs, data) end """ @@ -93,7 +93,7 @@ function XFaceField(FT::DataType, arch, grid, bcs = UVelocityBoundaryConditions(grid), data = new_data(FT, arch, grid, (Face, Center, Center))) - return Field{Face, Center, Center}(data, grid, bcs) + return Field(Face, Center, Center, arch, grid, bcs, data) end """ @@ -108,7 +108,7 @@ function YFaceField(FT::DataType, arch, grid, bcs = VVelocityBoundaryConditions(grid), data = new_data(FT, arch, grid, (Center, Face, Center))) - return Field{Center, Face, Center}(data, grid, bcs) + return Field(Center, Face, Center, arch, grid, bcs, data) end """ @@ -123,13 +123,13 @@ function ZFaceField(FT::DataType, arch, grid, bcs = WVelocityBoundaryConditions(grid), data = new_data(FT, arch, grid, (Center, Center, Face))) - return Field{Center, Center, Face}(data, grid, bcs) + return Field(Center, Center, Face, arch, grid, bcs, data) end - CenterField(arch::AbstractArchitecture, grid, args...) = CenterField(eltype(grid), arch, grid, args...) -XFaceField(arch::AbstractArchitecture, grid, args...) = XFaceField(eltype(grid), arch, grid, args...) -YFaceField(arch::AbstractArchitecture, grid, args...) = YFaceField(eltype(grid), arch, grid, args...) -ZFaceField(arch::AbstractArchitecture, grid, args...) = ZFaceField(eltype(grid), arch, grid, args...) +CenterField(arch::AbstractArchitecture, grid, args...) = CenterField(eltype(grid), arch, grid, args...) + XFaceField(arch::AbstractArchitecture, grid, args...) = XFaceField(eltype(grid), arch, grid, args...) + YFaceField(arch::AbstractArchitecture, grid, args...) = YFaceField(eltype(grid), arch, grid, args...) + ZFaceField(arch::AbstractArchitecture, grid, args...) = ZFaceField(eltype(grid), arch, grid, args...) @propagate_inbounds Base.setindex!(f::Field, v, inds...) = @inbounds setindex!(f.data, v, inds...) From 50c9b7a1230206d28cf8d9e564601a7acdb9c7de Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Tue, 9 Mar 2021 21:12:21 -0500 Subject: [PATCH 095/100] Nuke sandbox --- sandbox/mpi_turbulence.jl | 80 --------------------------------------- 1 file changed, 80 deletions(-) delete mode 100644 sandbox/mpi_turbulence.jl diff --git a/sandbox/mpi_turbulence.jl b/sandbox/mpi_turbulence.jl deleted file mode 100644 index d5b9824ad7..0000000000 --- a/sandbox/mpi_turbulence.jl +++ /dev/null @@ -1,80 +0,0 @@ -using MPI - -MPI.Initialized() || MPI.Init() - -using Statistics - -using Oceananigans.Advection -using Oceananigans.Fields -using Oceananigans.OutputWriters -using Oceananigans.AbstractOperations -using Oceananigans.Utils - -using Oceananigans.Solvers: calculate_pressure_right_hand_side!, copy_pressure! 
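(With the `Field` changes above, a field built on a multi-CPU architecture now picks up halo communication boundary conditions automatically. A usage sketch, assuming `arch` is a `MultiCPU` and `local_grid` is its rank-local grid:

    c = CenterField(Float64, arch, local_grid)   # BCs injected by the new Field method
    fill_halo_regions!(c, arch)
)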
- -topo = (Periodic, Periodic, Periodic) -full_grid = RegularRectilinearGrid(topology=topo, size=(512, 512, 1), extent=(4π, 4π, 1), halo=(3, 3, 3)) -arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - -dm = DistributedIncompressibleModel( - architecture = arch, - grid = full_grid, - timestepper = :RungeKutta3, - advection = WENO5(), - closure = IsotropicDiffusivity(ν=1e-5) -) - -model = dm.model -u₀ = rand(size(model.grid)...); -u₀ .-= mean(u₀); -set!(model, u=u₀, v=u₀) - -progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time)" -simulation = Simulation(model, Δt=0.05, stop_time=50, iteration_interval=1, progress=progress) - -u, v, w = model.velocities -outputs = (ζ=ComputedField(∂x(v) - ∂y(u)),) -simulation.output_writers[:fields] = NetCDFOutputWriter(model, outputs, filepath="mpi_turbulence_rank$(arch.my_rank).nc", schedule=TimeInterval(0.1)) - -MPI.Barrier(MPI.COMM_WORLD) - -run!(simulation) - -using Printf -using NCDatasets -using CairoMakie - -if arch.my_rank == 0 - ranks = 4 - - ds = [NCDataset("mpi_turbulence_rank$r.nc") for r in 0:ranks-1] - - frame = Node(1) - plot_title = @lift @sprintf("Oceananigans.jl + MPI: 2D turbulence t = %.1f", ds[1]["time"][$frame]) - ζ = [@lift ds[r]["ζ"][:, :, 1, $frame] for r in 1:ranks] - - fig = Figure(resolution=(1600, 1200)) - - for r in reverse(1:ranks) - ax = fig[ranks-r+1, 1] = Axis(fig, ylabel="rank $(r-1)", xticks = MultiplesTicks(9, pi, "π"), yticks = MultiplesTicks(3, pi, "π")) - hm = CairoMakie.heatmap!(ax, ds[r]["xF"], ds[r]["yF"], ζ[r], colormap=:balance, colorrange=(-2, 2)) - r > 1 && hidexdecorations!(ax, grid=false) - if r == 1 - cb = fig[:, 2] = Colorbar(fig, hm, label = "Vorticity ζ = ∂x(v) - ∂y(u)", width=30) - cb.height = Relative(2/3) - end - xlims!(ax, [0, 4π]) - ylims!(ax, [(r-1)*π, r*π]) - end - - supertitle = fig[0, :] = Label(fig, plot_title, textsize=30) - - trim!(fig.layout) - - record(fig, "mpi_turbulence.mp4", 1:length(ds[1]["time"])-1, framerate=30) do n - @info "Animating MPI turbulence frame $n/$(length(ds[1]["time"]))..." 
- frame[] = n - end - - [close(d) for d in ds] -end From 3a4f9ea8c15167d16728612613c182b1abc7888e Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Tue, 9 Mar 2021 21:40:45 -0500 Subject: [PATCH 096/100] Some renaming and added a communicator to `MultiCPU` --- .../distributed_fft_based_poisson_solver.jl | 2 +- src/Distributed/distributed_fields.jl | 2 +- .../distributed_incompressible_model.jl | 3 +- src/Distributed/halo_communication.jl | 30 ++--- src/Distributed/halo_communication_bcs.jl | 14 +-- src/Distributed/multi_architectures.jl | 42 +++---- test/test_distributed_models.jl | 112 +++++++++--------- test/test_distributed_poisson_solvers.jl | 4 +- 8 files changed, 104 insertions(+), 105 deletions(-) diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl index 5595e1b35c..039e1f2238 100644 --- a/src/Distributed/distributed_fft_based_poisson_solver.jl +++ b/src/Distributed/distributed_fft_based_poisson_solver.jl @@ -17,7 +17,7 @@ function DistributedFFTBasedPoissonSolver(arch, full_grid, local_grid) λy = poisson_eigenvalues(full_grid.Ny, full_grid.Ly, 2, TY()) λz = poisson_eigenvalues(full_grid.Nz, full_grid.Lz, 3, TZ()) - I, J, K = arch.my_index + I, J, K = arch.local_index λx = λx[(J-1)*local_grid.Ny+1:J*local_grid.Ny, :, :] eigenvalues = (; λx, λy, λz) diff --git a/src/Distributed/distributed_fields.jl b/src/Distributed/distributed_fields.jl index f73878a7bc..3d9a345577 100644 --- a/src/Distributed/distributed_fields.jl +++ b/src/Distributed/distributed_fields.jl @@ -4,7 +4,7 @@ function Field(X, Y, Z, arch::AbstractMultiArchitecture, grid, bcs = FieldBoundaryConditions(grid, (X, Y, Z)), data = new_data(eltype(grid), arch, grid, (X, Y, Z))) - communicative_bcs = inject_halo_communication_boundary_conditions(bcs, arch.my_rank, arch.connectivity) + communicative_bcs = inject_halo_communication_boundary_conditions(bcs, arch.local_rank, arch.connectivity) return Field(X, Y, Z, child_architecture(arch), grid, communicative_bcs, data) end diff --git a/src/Distributed/distributed_incompressible_model.jl b/src/Distributed/distributed_incompressible_model.jl index c167be41ce..2379586113 100644 --- a/src/Distributed/distributed_incompressible_model.jl +++ b/src/Distributed/distributed_incompressible_model.jl @@ -9,8 +9,7 @@ using Oceananigans.Grids: halo_size ##### function DistributedIncompressibleModel(; architecture, grid, model_kwargs...) 
- my_rank = architecture.my_rank - i, j, k = architecture.my_index + i, j, k = architecture.local_index Rx, Ry, Rz = architecture.ranks my_connectivity = architecture.connectivity diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index 4a39a16d4b..95e880428d 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -31,16 +31,16 @@ for side in sides send_tag_fn_name = Symbol("$(side)_send_tag") recv_tag_fn_name = Symbol("$(side)_recv_tag") @eval begin - function $send_tag_fn_name(my_rank, rank_to_send_to) - from_digits = string(my_rank, pad=RANK_DIGITS) + function $send_tag_fn_name(local_rank, rank_to_send_to) + from_digits = string(local_rank, pad=RANK_DIGITS) to_digits = string(rank_to_send_to, pad=RANK_DIGITS) side_digit = string(side_id[Symbol($side_str)]) return parse(Int, from_digits * to_digits * side_digit) end - function $recv_tag_fn_name(my_rank, rank_to_recv_from) + function $recv_tag_fn_name(local_rank, rank_to_recv_from) from_digits = string(rank_to_recv_from, pad=RANK_DIGITS) - to_digits = string(my_rank, pad=RANK_DIGITS) + to_digits = string(local_rank, pad=RANK_DIGITS) side_digit = string(side_id[opposite_side[Symbol($side_str)]]) return parse(Int, from_digits * to_digits * side_digit) end @@ -105,13 +105,13 @@ for (side, opposite_side) in zip([:east, :north, :top], [:west, :south, :bottom] @eval begin function $fill_both_halos!(c, bc_side::HaloCommunicationBC, bc_opposite_side::HaloCommunicationBC, arch, barrier, grid, c_location, args...) @assert bc_side.condition.from == bc_opposite_side.condition.from # Extra protection in case of bugs - my_rank = bc_side.condition.from + local_rank = bc_side.condition.from - $send_side_halo(c, grid, c_location, my_rank, bc_side.condition.to) - $send_opposite_side_halo(c, grid, c_location, my_rank, bc_opposite_side.condition.to) + $send_side_halo(c, grid, c_location, local_rank, bc_side.condition.to) + $send_opposite_side_halo(c, grid, c_location, local_rank, bc_opposite_side.condition.to) - $recv_and_fill_side_halo!(c, grid, c_location, my_rank, bc_side.condition.to) - $recv_and_fill_opposite_side_halo!(c, grid, c_location, my_rank, bc_opposite_side.condition.to) + $recv_and_fill_side_halo!(c, grid, c_location, local_rank, bc_side.condition.to) + $recv_and_fill_opposite_side_halo!(c, grid, c_location, local_rank, bc_opposite_side.condition.to) return nothing, nothing end @@ -129,11 +129,11 @@ for side in sides side_send_tag = Symbol("$(side)_send_tag") @eval begin - function $send_side_halo(c, grid, c_location, my_rank, rank_to_send_to) + function $send_side_halo(c, grid, c_location, local_rank, rank_to_send_to) send_buffer = $underlying_side_boundary(c, grid, c_location) - send_tag = $side_send_tag(my_rank, rank_to_send_to) + send_tag = $side_send_tag(local_rank, rank_to_send_to) - @debug "Sending " * $side_str * " halo: my_rank=$my_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" + @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag" status = MPI.Isend(send_buffer, rank_to_send_to, send_tag, MPI.COMM_WORLD) return status @@ -152,11 +152,11 @@ for side in sides side_recv_tag = Symbol("$(side)_recv_tag") @eval begin - function $recv_and_fill_side_halo!(c, grid, c_location, my_rank, rank_to_recv_from) + function $recv_and_fill_side_halo!(c, grid, c_location, local_rank, rank_to_recv_from) recv_buffer = $underlying_side_halo(c, grid, c_location) - recv_tag = $side_recv_tag(my_rank, 
rank_to_recv_from) + recv_tag = $side_recv_tag(local_rank, rank_to_recv_from) - @debug "Receiving " * $side_str * " halo: my_rank=$my_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" + @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag" MPI.Recv!(recv_buffer, rank_to_recv_from, recv_tag, MPI.COMM_WORLD) return nothing diff --git a/src/Distributed/halo_communication_bcs.jl b/src/Distributed/halo_communication_bcs.jl index ca72f7c485..56f2c511f7 100644 --- a/src/Distributed/halo_communication_bcs.jl +++ b/src/Distributed/halo_communication_bcs.jl @@ -20,7 +20,7 @@ HaloCommunicationRanks(; from, to) = HaloCommunicationRanks(from, to) print_condition(hcr::HaloCommunicationRanks) = "(from rank $(hcr.from) to rank $(hcr.to))" -function inject_halo_communication_boundary_conditions(field_bcs, my_rank, connectivity) +function inject_halo_communication_boundary_conditions(field_bcs, local_rank, connectivity) rank_east = connectivity.east rank_west = connectivity.west rank_north = connectivity.north @@ -28,12 +28,12 @@ function inject_halo_communication_boundary_conditions(field_bcs, my_rank, conne rank_top = connectivity.top rank_bottom = connectivity.bottom - east_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_east) - west_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_west) - north_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_north) - south_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_south) - top_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_top) - bottom_comm_ranks = HaloCommunicationRanks(from=my_rank, to=rank_bottom) + east_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_east) + west_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_west) + north_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_north) + south_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_south) + top_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_top) + bottom_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_bottom) east_comm_bc = HaloCommunicationBoundaryCondition(east_comm_ranks) west_comm_bc = HaloCommunicationBoundaryCondition(west_comm_ranks) diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl index 8cd6f8c60e..122711771a 100644 --- a/src/Distributed/multi_architectures.jl +++ b/src/Distributed/multi_architectures.jl @@ -2,20 +2,22 @@ using Oceananigans.Architectures using Oceananigans.Grids: topology, validate_tupled_argument -struct MultiCPU{G, R, I, ρ, C} <: AbstractCPUArchitecture - full_grid :: G - my_rank :: R - my_index :: I - ranks :: ρ - connectivity :: C +struct MultiCPU{G, R, I, ρ, C, γ} <: AbstractCPUArchitecture + distributed_grid :: G + local_rank :: R + local_index :: I + ranks :: ρ + connectivity :: C + communicator :: γ end -struct MultiGPU{G, R, I, ρ, C} <: AbstractGPUArchitecture - full_grid :: G - my_rank :: R - my_index :: I - ranks :: ρ - connectivity :: C +struct MultiGPU{G, R, I, ρ, C, γ} <: AbstractGPUArchitecture + distributed_grid :: G + local_rank :: R + local_index :: I + ranks :: ρ + connectivity :: C + communicator :: γ end const AbstractMultiArchitecture = Union{MultiCPU, MultiGPU} @@ -109,7 +111,7 @@ end ##### Constructors ##### -function MultiCPU(; grid, ranks) +function MultiCPU(; grid, ranks, communicator=MPI.COMM_WORLD) MPI.Initialized() || error("Must call MPI.Init() before constructing a MultiCPU.") 
validate_tupled_argument(ranks, Int, "ranks") @@ -117,21 +119,19 @@ function MultiCPU(; grid, ranks) Rx, Ry, Rz = ranks total_ranks = Rx*Ry*Rz - comm = MPI.COMM_WORLD + mpi_ranks = MPI.Comm_size(communicator) + local_rank = MPI.Comm_rank(communicator) - mpi_ranks = MPI.Comm_size(comm) - my_rank = MPI.Comm_rank(comm) - - i, j, k = my_index = rank2index(my_rank, Rx, Ry, Rz) + i, j, k = local_index = rank2index(local_rank, Rx, Ry, Rz) if total_ranks != mpi_ranks throw(ArgumentError("ranks=($Rx, $Ry, $Rz) [$total_ranks total] inconsistent " * "with number of MPI ranks: $mpi_ranks.")) end - my_connectivity = RankConnectivity(my_index, ranks, topology(grid)) + local_connectivity = RankConnectivity(local_index, ranks, topology(grid)) - return MultiCPU(grid, my_rank, my_index, ranks, my_connectivity) + return MultiCPU(grid, local_rank, local_index, ranks, local_connectivity, communicator) end ##### @@ -140,7 +140,7 @@ end function Base.show(io::IO, arch::MultiCPU) c = arch.connectivity - print(io, "MultiCPU architecture (rank $(arch.my_rank)/$(prod(arch.ranks))) [index $(arch.my_index) / $(arch.ranks)]\n", + print(io, "MultiCPU architecture (rank $(arch.local_rank)/$(prod(arch.ranks))) [index $(arch.local_index) / $(arch.ranks)]\n", "└── connectivity:", isnothing(c.east) ? "" : " east=$(c.east)", isnothing(c.west) ? "" : " west=$(c.west)", diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 81b31d5e34..dded3c090a 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -17,8 +17,8 @@ function test_triply_periodic_rank_connectivity_with_411_ranks() full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) - my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - @test my_rank == index2rank(arch.my_index..., arch.ranks...) + local_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test local_rank == index2rank(arch.local_index..., arch.ranks...) connectivity = arch.connectivity @@ -32,16 +32,16 @@ function test_triply_periodic_rank_connectivity_with_411_ranks() # | 0 | 1 | 2 | 3 | # +---+---+---+---+ - if my_rank == 0 + if local_rank == 0 @test connectivity.east == 1 @test connectivity.west == 3 - elseif my_rank == 1 + elseif local_rank == 1 @test connectivity.east == 2 @test connectivity.west == 0 - elseif my_rank == 2 + elseif local_rank == 2 @test connectivity.east == 3 @test connectivity.west == 1 - elseif my_rank == 3 + elseif local_rank == 3 @test connectivity.east == 0 @test connectivity.west == 2 end @@ -54,8 +54,8 @@ function test_triply_periodic_rank_connectivity_with_141_ranks() full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) - my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - @test my_rank == index2rank(arch.my_index..., arch.ranks...) + local_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test local_rank == index2rank(arch.local_index..., arch.ranks...) 
connectivity = arch.connectivity @@ -75,16 +75,16 @@ function test_triply_periodic_rank_connectivity_with_141_ranks() # | 0 | # +---+ - if my_rank == 0 + if local_rank == 0 @test connectivity.north == 1 @test connectivity.south == 3 - elseif my_rank == 1 + elseif local_rank == 1 @test connectivity.north == 2 @test connectivity.south == 0 - elseif my_rank == 2 + elseif local_rank == 2 @test connectivity.north == 3 @test connectivity.south == 1 - elseif my_rank == 3 + elseif local_rank == 3 @test connectivity.north == 0 @test connectivity.south == 2 end @@ -97,8 +97,8 @@ function test_triply_periodic_rank_connectivity_with_114_ranks() full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) - my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - @test my_rank == index2rank(arch.my_index..., arch.ranks...) + local_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test local_rank == index2rank(arch.local_index..., arch.ranks...) connectivity = arch.connectivity @@ -121,16 +121,16 @@ function test_triply_periodic_rank_connectivity_with_114_ranks() # / 0 / # /---/ - if my_rank == 0 + if local_rank == 0 @test connectivity.top == 1 @test connectivity.bottom == 3 - elseif my_rank == 1 + elseif local_rank == 1 @test connectivity.top == 2 @test connectivity.bottom == 0 - elseif my_rank == 2 + elseif local_rank == 2 @test connectivity.top == 3 @test connectivity.bottom == 1 - elseif my_rank == 3 + elseif local_rank == 3 @test connectivity.top == 0 @test connectivity.bottom == 2 end @@ -143,8 +143,8 @@ function test_triply_periodic_rank_connectivity_with_221_ranks() full_grid = RegularRectilinearGrid(topology=topo, size=(8, 8, 8), extent=(1, 2, 3)) arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) - my_rank = MPI.Comm_rank(MPI.COMM_WORLD) - @test my_rank == index2rank(arch.my_index..., arch.ranks...) + local_rank = MPI.Comm_rank(MPI.COMM_WORLD) + @test local_rank == index2rank(arch.local_index..., arch.ranks...) 
connectivity = arch.connectivity @@ -158,22 +158,22 @@ function test_triply_periodic_rank_connectivity_with_221_ranks() # | 1 | 3 | # +---+---+ - if my_rank == 0 + if local_rank == 0 @test connectivity.east == 2 @test connectivity.west == 2 @test connectivity.north == 1 @test connectivity.south == 1 - elseif my_rank == 1 + elseif local_rank == 1 @test connectivity.east == 3 @test connectivity.west == 3 @test connectivity.north == 0 @test connectivity.south == 0 - elseif my_rank == 2 + elseif local_rank == 2 @test connectivity.east == 0 @test connectivity.west == 0 @test connectivity.north == 3 @test connectivity.south == 3 - elseif my_rank == 3 + elseif local_rank == 3 @test connectivity.east == 1 @test connectivity.west == 1 @test connectivity.north == 2 @@ -193,12 +193,12 @@ function test_triply_periodic_local_grid_with_411_ranks() arch = MultiCPU(grid=full_grid, ranks=(4, 1, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + local_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = model.grid nx, ny, nz = size(local_grid) - @test local_grid.xF[1] == 0.25*my_rank - @test local_grid.xF[nx+1] == 0.25*(my_rank+1) + @test local_grid.xF[1] == 0.25*local_rank + @test local_grid.xF[nx+1] == 0.25*(local_rank+1) @test local_grid.yF[1] == 0 @test local_grid.yF[ny+1] == 2 @test local_grid.zF[1] == -3 @@ -213,14 +213,14 @@ function test_triply_periodic_local_grid_with_141_ranks() arch = MultiCPU(grid=full_grid, ranks=(1, 4, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + local_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = model.grid nx, ny, nz = size(local_grid) @test local_grid.xF[1] == 0 @test local_grid.xF[nx+1] == 1 - @test local_grid.yF[1] == 0.5*my_rank - @test local_grid.yF[ny+1] == 0.5*(my_rank+1) + @test local_grid.yF[1] == 0.5*local_rank + @test local_grid.yF[ny+1] == 0.5*(local_rank+1) @test local_grid.zF[1] == -3 @test local_grid.zF[nz+1] == 0 @@ -233,7 +233,7 @@ function test_triply_periodic_local_grid_with_114_ranks() arch = MultiCPU(grid=full_grid, ranks=(1, 1, 4)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - my_rank = MPI.Comm_rank(MPI.COMM_WORLD) + local_rank = MPI.Comm_rank(MPI.COMM_WORLD) local_grid = model.grid nx, ny, nz = size(local_grid) @@ -241,8 +241,8 @@ function test_triply_periodic_local_grid_with_114_ranks() @test local_grid.xF[nx+1] == 1 @test local_grid.yF[1] == 0 @test local_grid.yF[ny+1] == 2 - @test local_grid.zF[1] == -3 + 0.75*my_rank - @test local_grid.zF[nz+1] == -3 + 0.75*(my_rank+1) + @test local_grid.zF[1] == -3 + 0.75*local_rank + @test local_grid.zF[nz+1] == -3 + 0.75*(local_rank+1) return nothing end @@ -253,7 +253,7 @@ function test_triply_periodic_local_grid_with_221_ranks() arch = MultiCPU(grid=full_grid, ranks=(2, 2, 1)) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) - i, j, k = arch.my_index + i, j, k = arch.local_index local_grid = model.grid nx, ny, nz = size(local_grid) @@ -350,17 +350,17 @@ function test_triply_periodic_halo_communication_with_411_ranks(halo) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in merge(fields(model), model.pressures) - interior(field) .= arch.my_rank + interior(field) .= arch.local_rank fill_halo_regions!(field, arch) @test all(east_halo(field) .== 
arch.connectivity.east) @test all(west_halo(field) .== arch.connectivity.west) - @test all(interior(field) .== arch.my_rank) - @test all(north_halo(field, include_corners=false) .== arch.my_rank) - @test all(south_halo(field, include_corners=false) .== arch.my_rank) - @test all(top_halo(field, include_corners=false) .== arch.my_rank) - @test all(bottom_halo(field, include_corners=false) .== arch.my_rank) + @test all(interior(field) .== arch.local_rank) + @test all(north_halo(field, include_corners=false) .== arch.local_rank) + @test all(south_halo(field, include_corners=false) .== arch.local_rank) + @test all(top_halo(field, include_corners=false) .== arch.local_rank) + @test all(bottom_halo(field, include_corners=false) .== arch.local_rank) end return nothing @@ -373,17 +373,17 @@ function test_triply_periodic_halo_communication_with_141_ranks(halo) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in merge(fields(model), model.pressures) - interior(field) .= arch.my_rank + interior(field) .= arch.local_rank fill_halo_regions!(field, arch) @test all(north_halo(field) .== arch.connectivity.north) @test all(south_halo(field) .== arch.connectivity.south) - @test all(interior(field) .== arch.my_rank) - @test all(east_halo(field, include_corners=false) .== arch.my_rank) - @test all(west_halo(field, include_corners=false) .== arch.my_rank) - @test all(top_halo(field, include_corners=false) .== arch.my_rank) - @test all(bottom_halo(field, include_corners=false) .== arch.my_rank) + @test all(interior(field) .== arch.local_rank) + @test all(east_halo(field, include_corners=false) .== arch.local_rank) + @test all(west_halo(field, include_corners=false) .== arch.local_rank) + @test all(top_halo(field, include_corners=false) .== arch.local_rank) + @test all(bottom_halo(field, include_corners=false) .== arch.local_rank) end return nothing @@ -396,17 +396,17 @@ function test_triply_periodic_halo_communication_with_114_ranks(halo) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in merge(fields(model), model.pressures) - interior(field) .= arch.my_rank + interior(field) .= arch.local_rank fill_halo_regions!(field, arch) @test all(top_halo(field) .== arch.connectivity.top) @test all(bottom_halo(field) .== arch.connectivity.bottom) - @test all(interior(field) .== arch.my_rank) - @test all(east_halo(field, include_corners=false) .== arch.my_rank) - @test all(west_halo(field, include_corners=false) .== arch.my_rank) - @test all(north_halo(field, include_corners=false) .== arch.my_rank) - @test all(south_halo(field, include_corners=false) .== arch.my_rank) + @test all(interior(field) .== arch.local_rank) + @test all(east_halo(field, include_corners=false) .== arch.local_rank) + @test all(west_halo(field, include_corners=false) .== arch.local_rank) + @test all(north_halo(field, include_corners=false) .== arch.local_rank) + @test all(south_halo(field, include_corners=false) .== arch.local_rank) end return nothing @@ -419,7 +419,7 @@ function test_triply_periodic_halo_communication_with_221_ranks(halo) model = DistributedIncompressibleModel(architecture=arch, grid=full_grid, pressure_solver=nothing) for field in merge(fields(model), model.pressures) - interior(field) .= arch.my_rank + interior(field) .= arch.local_rank fill_halo_regions!(field, arch) @test all(east_halo(field, include_corners=false) .== arch.connectivity.east) @@ -427,9 +427,9 @@ function 
test_triply_periodic_halo_communication_with_221_ranks(halo) @test all(north_halo(field, include_corners=false) .== arch.connectivity.north) @test all(south_halo(field, include_corners=false) .== arch.connectivity.south) - @test all(interior(field) .== arch.my_rank) - @test all(top_halo(field, include_corners=false) .== arch.my_rank) - @test all(bottom_halo(field, include_corners=false) .== arch.my_rank) + @test all(interior(field) .== arch.local_rank) + @test all(top_halo(field, include_corners=false) .== arch.local_rank) + @test all(bottom_halo(field, include_corners=false) .== arch.local_rank) end return nothing diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl index 7726f5cd94..82e6d7c011 100644 --- a/test/test_distributed_poisson_solvers.jl +++ b/test/test_distributed_poisson_solvers.jl @@ -50,7 +50,7 @@ function divergence_free_poisson_solution_triply_periodic(grid_points, ranks) solve_poisson_equation!(solver) p_bcs = PressureBoundaryConditions(local_grid) - p_bcs = inject_halo_communication_boundary_conditions(p_bcs, arch.my_rank, arch.connectivity) + p_bcs = inject_halo_communication_boundary_conditions(p_bcs, arch.local_rank, arch.connectivity) ϕ = CenterField(Float64, child_architecture(arch), local_grid, p_bcs) # "pressure" ∇²ϕ = CenterField(Float64, child_architecture(arch), local_grid, p_bcs) @@ -64,5 +64,5 @@ end @testset "Distributed FFT-based Poisson solver" begin @info " Testing distributed FFT-based Poisson solver..." @test divergence_free_poisson_solution_triply_periodic((16, 16, 1), (1, 4, 1)) - @test divergence_free_poisson_solution_triply_periodic((64, 64, 1), (1, 4, 1)) + @test divergence_free_poisson_solution_triply_periodic((44, 44, 1), (1, 4, 1)) end From 447e121c404374593cea053096dabae3ce86f1ea Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Tue, 9 Mar 2021 22:03:19 -0500 Subject: [PATCH 097/100] Left to right --- .../distributed_incompressible_model.jl | 1 + src/Distributed/halo_communication.jl | 30 +++++++++---------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/Distributed/distributed_incompressible_model.jl b/src/Distributed/distributed_incompressible_model.jl index 2379586113..87d4c337ff 100644 --- a/src/Distributed/distributed_incompressible_model.jl +++ b/src/Distributed/distributed_incompressible_model.jl @@ -23,6 +23,7 @@ function DistributedIncompressibleModel(; architecture, grid, model_kwargs...) # Make sure we can put an integer number of grid points in each rank. # Will generalize in the future. + # TODO: Check that we have enough grid points on each rank to fit the halos! @assert isinteger(Nx / Rx) @assert isinteger(Ny / Ry) @assert isinteger(Nz / Rz) diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl index 95e880428d..b14daa6d6d 100644 --- a/src/Distributed/halo_communication.jl +++ b/src/Distributed/halo_communication.jl @@ -13,9 +13,9 @@ sides = (:west, :east, :south, :north, :top, :bottom) side_id = Dict(side => n for (n, side) in enumerate(sides)) opposite_side = Dict( - :east => :west, :west => :east, - :north => :south, :south => :north, - :top => :bottom, :bottom => :top + :west => :east, :east => :west, + :south => :north, :north => :south, + :bottom => :top, :top => :bottom ) # Define functions that return unique send and recv MPI tags for each side. 
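(For reference, a worked example of these tag helpers: with RANK_DIGITS = 3,
rank 2 sending its west halo to rank 1 computes west_send_tag(2, 1), i.e.
parse(Int, "002" * "001" * "1") == 20011, since side_id[:west] == 1. Rank 1
receiving into its east halo computes east_recv_tag(1, 2), which uses the
opposite side's id, side_id[:west], and so yields the same 20011. Matching
tags are what pair each Isend with the intended Recv!.)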
@@ -58,11 +58,11 @@ function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitectu barrier = Event(device(child_architecture(arch))) - east_event, west_event = fill_east_and_west_halos!(c, bcs.east, bcs.west, arch, barrier, grid, c_location, args...) - north_event, south_event = fill_north_and_south_halos!(c, bcs.north, bcs.south, arch, barrier, grid, c_location, args...) - top_event, bottom_event = fill_top_and_bottom_halos!(c, bcs.top, bcs.bottom, arch, barrier, grid, c_location, args...) + west_event, east_event = fill_west_and_east_halos!(c, bcs.west, bcs.east, arch, barrier, grid, c_location, args...) + south_event, north_event = fill_south_and_north_halos!(c, bcs.south, bcs.north, arch, barrier, grid, c_location, args...) + bottom_event, top_event = fill_bottom_and_top_halos!(c, bcs.bottom, bcs.top, arch, barrier, grid, c_location, args...) - events = [east_event, west_event, north_event, south_event, top_event, bottom_event] + events = [west_event, east_event, south_event, north_event, bottom_event, top_event] events = filter(e -> e isa Event, events) wait(device(child_architecture(arch)), MultiEvent(Tuple(events))) @@ -70,12 +70,12 @@ function fill_halo_regions!(c::AbstractArray, bcs, arch::AbstractMultiArchitectu end ##### -##### fill_east_and_west_halos! } -##### fill_north_and_south_halos! } for non-communicating boundary conditions (fallback) -##### fill_top_and_bottom_halos! } +##### fill_west_and_east_halos! } +##### fill_south_and_north_halos! } for non-communicating boundary conditions (fallback) +##### fill_bottom_and_top_halos! } ##### -for (side, opposite_side) in zip([:east, :north, :top], [:west, :south, :bottom]) +for (side, opposite_side) in zip([:west, :south, :bottom], [:east, :north, :top]) fill_both_halos! = Symbol("fill_$(side)_and_$(opposite_side)_halos!") fill_side_halo! = Symbol("fill_$(side)_halo!") fill_opposite_side_halo! = Symbol("fill_$(opposite_side)_halo!") @@ -90,12 +90,12 @@ for (side, opposite_side) in zip([:east, :north, :top], [:west, :south, :bottom] end ##### -##### fill_east_and_west_halos! } -##### fill_north_and_south_halos! } for when both halos are communicative -##### fill_top_and_bottom_halos! } +##### fill_west_and_east_halos! } +##### fill_south_and_north_halos! } for when both halos are communicative +##### fill_bottom_and_top_halos! } ##### -for (side, opposite_side) in zip([:east, :north, :top], [:west, :south, :bottom]) +for (side, opposite_side) in zip([:west, :south, :bottom], [:east, :north, :top]) fill_both_halos! 
= Symbol("fill_$(side)_and_$(opposite_side)_halos!") send_side_halo = Symbol("send_$(side)_halo") send_opposite_side_halo = Symbol("send_$(opposite_side)_halo") From 3992e473858a4f3c85d914fd2f23fd147fd623de Mon Sep 17 00:00:00 2001 From: "ali.hh.ramadhan@gmail.com" Date: Tue, 9 Mar 2021 23:32:39 -0500 Subject: [PATCH 098/100] Strong scaling benchmark for incompressible model --- benchmark/Manifest.toml | 63 +++++++++++++------ benchmark/Project.toml | 1 + .../strong_scaling_incompressible_model.jl | 39 ++++++++++++ ...ong_scaling_incompressible_model_single.jl | 40 ++++++++++++ 4 files changed, 123 insertions(+), 20 deletions(-) create mode 100644 benchmark/strong_scaling_incompressible_model.jl create mode 100644 benchmark/strong_scaling_incompressible_model_single.jl diff --git a/benchmark/Manifest.toml b/benchmark/Manifest.toml index 11edda056a..f40874ac4e 100644 --- a/benchmark/Manifest.toml +++ b/benchmark/Manifest.toml @@ -2,15 +2,14 @@ [[AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "8ed9de2f1b1a9b1dee48582ad477c6e67b83eb2c" +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.0" +version = "1.0.1" [[AbstractTrees]] -deps = ["Markdown"] -git-tree-sha1 = "33e450545eaf7699da1a6e755f9ea65f14077a45" +git-tree-sha1 = "03e0550477d86222521d254b741d470ba17ea0b5" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.3.3" +version = "0.3.4" [[Adapt]] deps = ["LinearAlgebra"] @@ -31,9 +30,9 @@ uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.1.0" [[BSON]] -git-tree-sha1 = "2878972c4bc17d9c8d26d48d9ef00fcfe1899e7a" +git-tree-sha1 = "db18b5ea04686f73d269e10bdb241947c40d7d6f" uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -version = "0.3.0" +version = "0.3.2" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -67,6 +66,12 @@ git-tree-sha1 = "de4f08843c332d355852721adb1592bce7924da3" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" version = "0.9.29" +[[CodecZlib]] +deps = ["TranscodingStreams", "Zlib_jll"] +git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.7.0" + [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] git-tree-sha1 = "919c7f3151e79ff196add81d7f4e45d91bbf420b" @@ -178,6 +183,12 @@ git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" uuid = "82899510-4779-5014-852e-03e436cf321d" version = "1.0.0" +[[JLD2]] +deps = ["CodecZlib", "DataStructures", "MacroTools", "Mmap", "Pkg", "Printf", "Requires", "UUIDs"] +git-tree-sha1 = "b8343a7f96591404ade118b3a7014e1a52062465" +uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +version = "0.4.2" + [[JLLWrappers]] git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" @@ -248,9 +259,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[NNlib]] deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Pkg", "Requires", "Statistics"] -git-tree-sha1 = "df42d0816edfc24f5b82a728f46381613c4dff79" +git-tree-sha1 = "5ce2e4b2bfe3811811e7db4b6a148439806fd2f8" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.7.14" +version = "0.7.16" [[OrderedCollections]] git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" @@ -259,9 +270,9 @@ version = "1.4.0" [[Parsers]] deps = ["Dates"] -git-tree-sha1 = 
"50c9a9ed8c714945e01cd53a21007ed3865ed714" +git-tree-sha1 = "223a825cccef2228f3fdbf2ecc7ca93363059073" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "1.0.15" +version = "1.0.16" [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] @@ -274,10 +285,10 @@ uuid = "32113eaa-f34f-5b0d-bd6c-c81e245fc73d" version = "0.2.10" [[PooledArrays]] -deps = ["DataAPI"] -git-tree-sha1 = "0e8f5c428a41a81cd71f76d76f2fc3415fe5a676" +deps = ["DataAPI", "Future"] +git-tree-sha1 = "cde4ce9d6f33219465b55162811d8de8139c0414" uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "1.1.0" +version = "1.2.1" [[PrettyTables]] deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"] @@ -310,9 +321,9 @@ version = "1.0.0" [[Requires]] deps = ["UUIDs"] -git-tree-sha1 = "cfbac6c1ed70c002ec6361e7fd334f02820d6419" +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.1.2" +version = "1.1.3" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -367,9 +378,9 @@ version = "1.0.0" [[Tables]] deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] -git-tree-sha1 = "a716dde43d57fa537a19058d044b495301ba6565" +git-tree-sha1 = "f03fc113290ee7726b173fc7ea661260d204b3f2" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.3.2" +version = "1.4.0" [[TerminalLoggers]] deps = ["LeftChildRightSiblingTrees", "Logging", "Markdown", "Printf", "ProgressLogging", "UUIDs"] @@ -383,9 +394,15 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] deps = ["Printf"] -git-tree-sha1 = "3318281dd4121ecf9713ce1383b9ace7d7476fdd" +git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.7" +version = "0.5.8" + +[[TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.5" [[UUIDs]] deps = ["Random", "SHA"] @@ -393,3 +410,9 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "320228915c8debb12cb434c59057290f0834dbf6" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+18" diff --git a/benchmark/Project.toml b/benchmark/Project.toml index 4861b22f9e..86bb462f92 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -8,6 +8,7 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" PkgBenchmark = "32113eaa-f34f-5b0d-bd6c-c81e245fc73d" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" diff --git a/benchmark/strong_scaling_incompressible_model.jl b/benchmark/strong_scaling_incompressible_model.jl new file mode 100644 index 0000000000..7ae1d8cb45 --- /dev/null +++ b/benchmark/strong_scaling_incompressible_model.jl @@ -0,0 +1,39 @@ +using JLD2 +using BenchmarkTools +using Benchmarks + +# Benchmark parameters + +Nx = 128 +Ny = 128 +Nz = Nx + +ranks = (1, 2, 4) + +# Run and collect benchmarks + +print_system_info() + +for r in ranks + @info "Benchmarking distributed incompressible model strong scaling [N=($Nx, $Ny, $Nz), ranks=$r]..." 
+ julia = Base.julia_cmd() + run(`mpiexec -np $r $julia --project strong_scaling_incompressible_model_single.jl $Nx $Ny $Nz`) +end + +suite = BenchmarkGroup(["size", "ranks"]) +for r in ranks + jldopen("strong_scaling_incompressible_model_$r.jld2", "r") do file + suite[((Nx, Ny, Nz), r)] = file["trial"] + end +end + +# Summarize benchmarks + +df = benchmarks_dataframe(suite) +sort!(df, :ranks) +benchmarks_pretty_table(df, title="Incompressible model strong scaling benchmark") + +suite_Δ = speedups_suite(suite, base_case=((Nx, Ny, Nz), 1)) +df_Δ = speedups_dataframe(suite_Δ) +sort!(df_Δ, :ranks) +benchmarks_pretty_table(df_Δ, title="Incompressible model strong scaling speedup") diff --git a/benchmark/strong_scaling_incompressible_model_single.jl b/benchmark/strong_scaling_incompressible_model_single.jl new file mode 100644 index 0000000000..28ab30e60d --- /dev/null +++ b/benchmark/strong_scaling_incompressible_model_single.jl @@ -0,0 +1,40 @@ +using Logging +using MPI +using JLD2 +using BenchmarkTools + +using Oceananigans +using Oceananigans.Distributed +using Benchmarks + +Logging.global_logger(OceananigansLogger()) + +MPI.Init() +comm = MPI.COMM_WORLD + +R = MPI.Comm_size(comm) + +Nx = parse(Int, ARGS[1]) +Ny = parse(Int, ARGS[2]) +Nz = parse(Int, ARGS[3]) + +@info "Setting up distributed incompressible model with N=($Nx, $Ny, $Nz) grid points on $R rank(s)..." + +topo = (Periodic, Periodic, Periodic) +distributed_grid = RegularRectilinearGrid(topology=topo, size=(Nx, Ny, Nz), extent=(1, 1, 1)) +arch = MultiCPU(grid=distributed_grid, ranks=(1, R, 1)) +model = DistributedIncompressibleModel(architecture=arch, grid=distributed_grid) + +@info "Warming up distributed incompressible model..." + +time_step!(model, 1) # warmup + +@info "Benchmarking distributed incompressible model..." + +trial = @benchmark begin + @sync_gpu time_step!($model, 1) +end samples=10 + +jldopen("strong_scaling_incompressible_model_$R.jld2", "w") do file + file["trial"] = trial +end From b3a75b8b94f0c068fdad147f691fa58f9afd440a Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 10 Mar 2021 01:18:14 -0500 Subject: [PATCH 099/100] Slightly better strong scaling benchmark --- benchmark/strong_scaling_incompressible_model.jl | 6 +++--- benchmark/strong_scaling_incompressible_model_single.jl | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmark/strong_scaling_incompressible_model.jl b/benchmark/strong_scaling_incompressible_model.jl index 7ae1d8cb45..3b24cb272a 100644 --- a/benchmark/strong_scaling_incompressible_model.jl +++ b/benchmark/strong_scaling_incompressible_model.jl @@ -4,11 +4,11 @@ using Benchmarks # Benchmark parameters -Nx = 128 -Ny = 128 +Nx = 256 +Ny = 256 Nz = Nx -ranks = (1, 2, 4) +ranks = (1, 2, 4, 8, 16) # Run and collect benchmarks diff --git a/benchmark/strong_scaling_incompressible_model_single.jl b/benchmark/strong_scaling_incompressible_model_single.jl index 28ab30e60d..9e5d3aa4c5 100644 --- a/benchmark/strong_scaling_incompressible_model_single.jl +++ b/benchmark/strong_scaling_incompressible_model_single.jl @@ -12,6 +12,7 @@ Logging.global_logger(OceananigansLogger()) MPI.Init() comm = MPI.COMM_WORLD +local_rank = MPI.Comm_rank(comm) R = MPI.Comm_size(comm) Nx = parse(Int, ARGS[1]) @@ -33,8 +34,12 @@ time_step!(model, 1) # warmup trial = @benchmark begin @sync_gpu time_step!($model, 1) + MPI.Barrier(comm) end samples=10 -jldopen("strong_scaling_incompressible_model_$R.jld2", "w") do file +@info "Rank $local_rank is done benchmarking!" 
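(Two notes on the benchmark scripts above. First, each rank count is benchmarked
in its own mpiexec launch: an MPI world size is fixed for the lifetime of a
process and MPI.Init() cannot be called twice, so the driver shells out to fresh
julia processes rather than looping over rank counts in-process. Second, the
MPI.Barrier(comm) inside the benchmarked expression keeps the ranks in lockstep,
so each sample reflects the slowest rank rather than letting fast ranks race
ahead.)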
+ +jldopen("strong_scaling_incompressible_model_$(R)_$local_rank.jld2", "w") do file file["trial"] = trial end + From ab3e539789119f91f0ead1b7ea610e89dc527ef6 Mon Sep 17 00:00:00 2001 From: ali-ramadhan Date: Wed, 10 Mar 2021 01:20:20 -0500 Subject: [PATCH 100/100] Update [compat] entries --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 855bc1e7d2..d9e42a09d0 100644 --- a/Project.toml +++ b/Project.toml @@ -35,9 +35,11 @@ FFTW = "^1" Glob = "1.3" JLD2 = "^0.2, ^0.3, 0.4" KernelAbstractions = "^0.3, 0.4, 0.5" +MPI = "0.16" NCDatasets = "^0.10, ^0.11" OffsetArrays = "^1.4" OrderedCollections = "^1.1" +PencilFFTs = "0.12" SafeTestsets = "0.0.1" SeawaterPolynomials = "^0.2" StructArrays = "0.4, 0.5"
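For reference, a hypothetical driver-side sketch for collecting the per-rank
trial files written above (assumes a finished run on four ranks; reducing to the
slowest rank is a deliberate choice, since a distributed time step is only as
fast as its slowest rank):

    using JLD2, BenchmarkTools

    R = 4
    trials = [jldopen(f -> f["trial"], "strong_scaling_incompressible_model_$(R)_$(r).jld2", "r")
              for r in 0:R-1]

    slowest = maximum(minimum(t).time for t in trials)  # fastest sample per rank, in nanoseconds
    @info "One time step on $R ranks takes $(slowest / 1e9) seconds on the slowest rank"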