diff --git a/CHIPYARD.hash b/CHIPYARD.hash
index 747b6b18..cd133513 100644
--- a/CHIPYARD.hash
+++ b/CHIPYARD.hash
@@ -1 +1 @@
-bcbe3b7f1f40d1c388aca68df498fd7dd4d16e89
+f86707bc95d7e95828e63d70ff28fbdaa76a884e
diff --git a/README.md b/README.md
index 4caf35bb..3e41d6a5 100644
--- a/README.md
+++ b/README.md
@@ -24,41 +24,26 @@ Dependencies
 
 Before beginning, install the [Chipyard dependencies](https://chipyard.readthedocs.io/en/latest/Chipyard-Basics/Initial-Repo-Setup.html#default-requirements-installation).
 
-Installing Chipyard and Spike
+Installing Gemmini
 -----------------------------
 
-Run these steps to install Chipyard and Spike (make sure to checkout the correct Chipyard and Spike commits as shown below):
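+Run these steps to install Chipyard and Gemmini (make sure to check out the Chipyard and Gemmini branches shown below):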
 
 ```shell
 git clone https://github.com/ucb-bar/chipyard.git
 cd chipyard
-git checkout 1.8.1
+git checkout gemmini-fp8-exploration-2
 ./build-setup.sh riscv-tools
 
 source env.sh
 
 cd generators/gemmini
 git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
-git checkout dev && git pull origin dev
+git checkout fp8-recoding && git pull origin fp8-recoding
 git submodule update --init --recursive
 
 make -C software/libgemmini install
 
-# The final step is only necessary if you want to run MIDAS simulations with
-# realistic DRAM models
-cd -
-cd sims/firesim
-source sourceme-f1-manager.sh --skip-ssh-setup # Ignore error messages from this command
-./build-setup.sh --library --skip-validate
-```
-
-Setting Up Gemmini
-------------------
-
-Run the steps below to set up Gemmini configuration files, symlinks, and subdirectories:
-
-```shell
-cd chipyard/generators/gemmini
 ./scripts/setup-paths.sh
 ```
 
diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests
index bc3cabb4..f9fd1c05 160000
--- a/software/gemmini-rocc-tests
+++ b/software/gemmini-rocc-tests
@@ -1 +1 @@
-Subproject commit bc3cabb4081e3dbaa1d33119f0471357f2aa3927
+Subproject commit f9fd1c05640d561bc47a192a6aaccd503b5b0705
diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala
index c6792578..c8f75387 100644
--- a/src/main/scala/gemmini/Arithmetic.scala
+++ b/src/main/scala/gemmini/Arithmetic.scala
@@ -8,8 +8,8 @@ import chisel3.util._
 import hardfloat._
 
 // Bundles that represent the raw bits of custom datatypes
-case class Float(expWidth: Int, sigWidth: Int) extends Bundle {
-  val bits = UInt((expWidth + sigWidth).W)
+case class Float(expWidth: Int, sigWidth: Int, isRecoded: Boolean = false) extends Bundle { // isRecoded = true: `bits` is fed to the hardfloat recoded-format units as-is instead of going through recFNFromFN
+  val bits = UInt((expWidth + sigWidth + (if (isRecoded) 1 else 0)).W) // the recoded form is one bit wider than the standard form
 
   val bias: Int = (1 << (expWidth-1)) - 1
 }
@@ -245,7 +245,7 @@ object Arithmetic {
       }
 
       override def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = u match {
-        case Float(expWidth, sigWidth) =>
+        case Float(expWidth, sigWidth, false) =>
           val input = Wire(Decoupled(UInt(0.W)))
           val output = Wire(Decoupled(u.cloneType))
 
@@ -287,7 +287,7 @@ object Arithmetic {
       }
 
       override def mult_with_reciprocal[U <: Data](reciprocal: U): SInt = reciprocal match {
-        case recip @ Float(expWidth, sigWidth) =>
+        case recip @ Float(expWidth, sigWidth, false) =>
           def in_to_float(x: SInt) = {
             val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth))
             in_to_rec_fn.io.signedIn := true.B
@@ -328,12 +328,10 @@ object Arithmetic {
   }
 
   implicit object FloatArithmetic extends Arithmetic[Float] {
-    // TODO Floating point arithmetic currently switches between recoded and standard formats for every operation. However, it should stay in the recoded format as it travels through the systolic array
-
     override implicit def cast(self: Float): ArithmeticOps[Float] = new ArithmeticOps(self) {
       override def *(t: Float): Float = {
-        val t_rec = recFNFromFN(t.expWidth, t.sigWidth, t.bits)
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
+        val t_rec = if (t.isRecoded) t.bits else recFNFromFN(t.expWidth, t.sigWidth, t.bits)
+        val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits)
 
         val t_resizer =  Module(new RecFNToRecFN(t.expWidth, t.sigWidth, self.expWidth, self.sigWidth))
         t_resizer.io.in := t_rec
@@ -351,16 +349,16 @@ object Arithmetic {
         muladder.io.b := t_rec_resized
         muladder.io.c := 0.U
 
-        val out = Wire(Float(self.expWidth, self.sigWidth))
-        out.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)
+        val out = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded))
+        out.bits := (if (out.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out))
         out
       }
 
       override def mac(m1: Float, m2: Float): Float = {
         // Recode all operands
-        val m1_rec = recFNFromFN(m1.expWidth, m1.sigWidth, m1.bits)
-        val m2_rec = recFNFromFN(m2.expWidth, m2.sigWidth, m2.bits)
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
+        val m1_rec = if (m1.isRecoded) m1.bits else recFNFromFN(m1.expWidth, m1.sigWidth, m1.bits)
+        val m2_rec = if (m2.isRecoded) m2.bits else recFNFromFN(m2.expWidth, m2.sigWidth, m2.bits)
+        val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits)
 
         // Resize m1 to self's width
         val m1_resizer = Module(new RecFNToRecFN(m1.expWidth, m1.sigWidth, self.expWidth, self.sigWidth))
@@ -388,8 +386,8 @@ object Arithmetic {
         muladder.io.c := self_rec
 
         // Convert result to standard format // TODO remove these intermediate recodings
-        val out = Wire(Float(self.expWidth, self.sigWidth))
-        out.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)
+        val out = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded))
+        out.bits := (if (out.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out))
         out
       }
 
@@ -397,8 +395,8 @@ object Arithmetic {
         require(self.getWidth >= t.getWidth) // This just makes it easier to write the resizing code
 
         // Recode all operands
-        val t_rec = recFNFromFN(t.expWidth, t.sigWidth, t.bits)
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
+        val t_rec = if (t.isRecoded) t.bits else recFNFromFN(t.expWidth, t.sigWidth, t.bits)
+        val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits)
 
         // Generate 1 as a float
         val in_to_rec_fn = Module(new INToRecFN(1, self.expWidth, self.sigWidth))
@@ -427,8 +425,8 @@ object Arithmetic {
         muladder.io.b := one_rec
         muladder.io.c := self_rec
 
-        val result = Wire(Float(self.expWidth, self.sigWidth))
-        result.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)
+        val result = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded))
+        result.bits := (if (result.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out))
         result
       }
 
@@ -440,7 +438,7 @@ object Arithmetic {
 
       override def >>(u: UInt): Float = {
         // Recode self
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
+        val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits)
 
         // Get 2^(-u) as a recoded float
         val shift_exp = Wire(UInt(self.expWidth.W))
@@ -461,15 +459,15 @@ object Arithmetic {
         muladder.io.b := shift_rec
         muladder.io.c := 0.U
 
-        val result = Wire(Float(self.expWidth, self.sigWidth))
-        result.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)
+        val result = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded))
+        result.bits := (if (result.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out))
         result
       }
 
       override def >(t: Float): Bool = {
         // Recode all operands
-        val t_rec = recFNFromFN(t.expWidth, t.sigWidth, t.bits)
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
+        val t_rec = if (t.isRecoded) t.bits else recFNFromFN(t.expWidth, t.sigWidth, t.bits)
+        val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits)
 
         // Resize t to self's width
         val t_resizer = Module(new RecFNToRecFN(t.expWidth, t.sigWidth, self.expWidth, self.sigWidth))
@@ -487,43 +485,49 @@ object Arithmetic {
       }
 
       override def withWidthOf(t: Float): Float = {
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
+        val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) // recode only when self is still in standard format
 
         val resizer = Module(new RecFNToRecFN(self.expWidth, self.sigWidth, t.expWidth, t.sigWidth))
         resizer.io.in := self_rec
         resizer.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
         resizer.io.detectTininess := consts.tininess_afterRounding
 
-        val result = Wire(Float(t.expWidth, t.sigWidth))
-        result.bits := fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out)
+        val result = Wire(Float(t.expWidth, t.sigWidth, t.isRecoded)) // the result takes t's widths AND t's recoded flag, not self's
+        result.bits := (if (result.isRecoded) resizer.io.out else fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out)) // convert back to standard format only when t is not recoded
         result
       }
 
       override def clippedToWidthOf(t: Float): Float = {
         // TODO check for overflow. Right now, we just assume that overflow doesn't happen
-        val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits)
+        val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) // recode only when self is still in standard format
 
         val resizer = Module(new RecFNToRecFN(self.expWidth, self.sigWidth, t.expWidth, t.sigWidth))
         resizer.io.in := self_rec
         resizer.io.roundingMode := consts.round_near_even // consts.round_near_maxMag
         resizer.io.detectTininess := consts.tininess_afterRounding
 
-        val result = Wire(Float(t.expWidth, t.sigWidth))
-        result.bits := fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out)
+        val result = Wire(Float(t.expWidth, t.sigWidth, t.isRecoded)) // the result takes t's widths AND t's recoded flag, not self's
+        result.bits := (if (result.isRecoded) resizer.io.out else fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out)) // convert back to standard format only when t is not recoded
         result
       }
 
       override def relu: Float = {
-        val raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits)
+        val raw = if (self.isRecoded) rawFloatFromRecFN(self.expWidth, self.sigWidth, self.bits) else rawFloatFromFN(self.expWidth, self.sigWidth, self.bits) // decode with the helper that matches self's storage format
 
-        val result = Wire(Float(self.expWidth, self.sigWidth))
+        val result = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded)) // output keeps self's format, so self.bits can be passed through unchanged
         result.bits := Mux(!raw.isZero && raw.sign, 0.U, self.bits)
         result
       }
 
       override def zero: Float = 0.U.asTypeOf(self)
-      override def identity: Float = Cat(0.U(2.W), ~(0.U((self.expWidth-1).W)), 0.U((self.sigWidth-1).W)).asTypeOf(self)
-      override def minimum: Float = Cat(1.U, ~(0.U(self.expWidth.W)), 0.U((self.sigWidth-1).W)).asTypeOf(self)
+      override def identity: Float = { // multiplicative identity, built as a raw bit pattern
+        require(!self.isRecoded, "FloatArithmetic.identity is only implemented for non-recoded Floats") // the literal below has the standard-format width (expWidth + sigWidth bits), not the recoded one
+        Cat(0.U(2.W), ~(0.U((self.expWidth-1).W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) // sign 0, exponent field == bias, fraction 0
+      }
+      override def minimum: Float = { // smallest representable value, built as a raw bit pattern
+        require(!self.isRecoded, "FloatArithmetic.minimum is only implemented for non-recoded Floats") // the literal below has the standard-format width (expWidth + sigWidth bits), not the recoded one
+        Cat(1.U, ~(0.U(self.expWidth.W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) // sign 1, exponent all ones, fraction 0 (negative infinity in an IEEE-style layout)
+      }
     }
   }
 
diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala
index bd84b317..92155fb7 100644
--- a/src/main/scala/gemmini/Configs.scala
+++ b/src/main/scala/gemmini/Configs.scala
@@ -10,7 +10,6 @@ import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile._
 import freechips.rocketchip.system._
 import freechips.rocketchip.diplomacy._
-
 import gemmini.Arithmetic.SIntArithmetic
 import hardfloat._
 
@@ -22,8 +21,11 @@ object GemminiConfigs {
   val defaultConfig = GemminiArrayConfig[SInt, Float, Float](
     // Datatypes
     inputType = SInt(8.W),
+    weightType = SInt(8.W),
     accType = SInt(32.W),
 
+    spatialArrayInputType = SInt(8.W),
+    spatialArrayWeightType = SInt(8.W),
     spatialArrayOutputType = SInt(20.W),
 
     // Spatial array size options
@@ -166,7 +168,10 @@ object GemminiConfigs {
 
   val dummyConfig = GemminiArrayConfig[DummySInt, Float, Float](
     inputType = DummySInt(8),
+    weightType = DummySInt(8),
     accType = DummySInt(32),
+    spatialArrayInputType = DummySInt(8),
+    spatialArrayWeightType = DummySInt(8),
     spatialArrayOutputType = DummySInt(20),
     tileRows     = defaultConfig.tileRows,
     tileColumns  = defaultConfig.tileColumns,
diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
index c76907dd..ef5ce85d 100644
--- a/src/main/scala/gemmini/ConfigsFP.scala
+++ b/src/main/scala/gemmini/ConfigsFP.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chisel3._
@@ -48,10 +49,15 @@ object GemminiFPConfigs {
     max_in_flight_mem_reqs = 16,
     use_dedicated_tl_port = false,
     use_shared_ext_mem = false,
+
     inputType = Float(8, 24),
-    spatialArrayOutputType = Float(8, 24),
+    weightType = Float(8, 24),
     accType = Float(8, 24),
 
+    spatialArrayInputType = Float(8, 24),
+    spatialArrayWeightType = Float(8, 24),
+    spatialArrayOutputType = Float(8, 24),
+
     mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
     mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
     mvin_scale_shared = false,
@@ -81,34 +87,67 @@ object GemminiFPConfigs {
   )
   
   //FP32 Single Precision Configuration
-  val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), spatialArrayOutputType = Float(8, 24), accType = Float(8, 24),
+  val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), weightType = Float(8, 24), accType = Float(8, 24),
+                                               spatialArrayInputType = Float(8, 24), spatialArrayWeightType = Float(8, 24), spatialArrayOutputType = Float(8, 24),
                                                tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
  
   //FP16 Half Precision Configuration
-  val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24),
+  val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), weightType = Float(5, 11), accType = Float(8, 24),
+                                               spatialArrayInputType = Float(5, 11), spatialArrayWeightType = Float(5, 11), spatialArrayOutputType = Float(5, 11),
                                                tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
   
   //Bfloat16 Brain-half Precision Configuration
-  val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24),
+  val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), weightType = Float(8, 8), accType = Float(8, 24),
+                                               spatialArrayInputType = Float(8, 8), spatialArrayWeightType = Float(8, 8), spatialArrayOutputType = Float(8, 8),
                                                tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
 
   //Bfloat16 Brain-half Precision Configuration 8x8 array
-  val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24),
+  val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), weightType = Float(8, 8), accType = Float(8, 24),
+                                               spatialArrayInputType = Float(8, 8), spatialArrayWeightType = Float(8, 8), spatialArrayOutputType = Float(8, 8),
                                                meshRows = 8, meshColumns = 8,
                                                tile_latency = 2,
                                                mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
 
+  val FP8TransformerConfig = defaultFPConfig.copy( // FP8 preset: 8-bit Float(4, 4) inputs and weights, Float(8, 24) accumulator
+    inputType = Float(4, 4), weightType = Float(4, 4), accType = Float(8, 24),
+
+    spatialArrayInputType = Float(4, 4, isRecoded = true), spatialArrayWeightType = Float(4, 4, isRecoded = true), spatialArrayOutputType = Float(5, 11, isRecoded = true), // operands stay in recoded form inside the spatial array
+
+    has_normalizations = false, // TODO this should be true
+    has_dw_convs = false, has_training_convs = false, has_max_pool = false, has_first_layer_optimizations = false, has_nonlinear_activations = false,
+
+    dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true, // weight-stationary dataflow only
+
+    meshRows = 16, meshColumns = 16,
+    acc_capacity = CapacityInKilobytes(128),
+
+    tile_latency = 2,
+  )
+
+  val FP8TransformerWithoutDenormalRecodingsConfig = FP8TransformerConfig.copy( // same as FP8TransformerConfig, but the spatial-array types are not kept in recoded form
+    spatialArrayInputType = Float(4, 4, isRecoded = false), spatialArrayWeightType = Float(4, 4, isRecoded = false), spatialArrayOutputType = Float(5, 11, isRecoded = false),
+  )
+
+  val HybridFP8TransformerConfig = FP8TransformerConfig.copy( // two 8-bit formats: Float(4, 4) inputs (inherited) and Float(5, 3) weights
+    weightType = Float(5, 3),
+    spatialArrayInputType = Float(5, 4, isRecoded = true), spatialArrayWeightType = Float(5, 4, isRecoded = true), // one recoded array format with 5 exponent bits and 4 significand bits for both operand kinds
+    programmable_datatypes = true // operand datatype selection is set at runtime rather than hard-wired
+  )
+
+  val NonProgrammableHybridFP8TransformerConfig = HybridFP8TransformerConfig.copy( // hybrid datatypes, but with the operand datatype selection fixed in hardware
+    programmable_datatypes = false
+  )
 }
 
 
@@ -171,3 +210,43 @@ class GemminiBF16Default8Config extends Config((site, here, up) => {
   )
 })
 
+// FP8 Transformer configs
+class GemminiFP8TransformerConfig extends Config((site, here, up) => { // sets BuildRoCC to a single Gemmini built from GemminiFPConfigs.FP8TransformerConfig
+  case BuildRoCC => Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      implicit val v = implicitly[ValName] // NOTE(review): the ValName is presumably derived from this val's name - confirm before renaming
+      LazyModule(new Gemmini(GemminiFPConfigs.FP8TransformerConfig))
+    }
+  )
+})
+
+class GemminiFP8TransformerNoRecodingConfig extends Config((site, here, up) => { // sets BuildRoCC to a single Gemmini built from GemminiFPConfigs.FP8TransformerWithoutDenormalRecodingsConfig
+  case BuildRoCC => Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      implicit val v = implicitly[ValName] // NOTE(review): the ValName is presumably derived from this val's name - confirm before renaming
+      LazyModule(new Gemmini(GemminiFPConfigs.FP8TransformerWithoutDenormalRecodingsConfig))
+    }
+  )
+})
+
+class GemminiHybridFP8TransformerConfig extends Config((site, here, up) => { // sets BuildRoCC to a single Gemmini built from GemminiFPConfigs.HybridFP8TransformerConfig
+  case BuildRoCC => Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      implicit val v = implicitly[ValName] // NOTE(review): the ValName is presumably derived from this val's name - confirm before renaming
+      LazyModule(new Gemmini(GemminiFPConfigs.HybridFP8TransformerConfig))
+    }
+  )
+})
+
+class GemminiNonProgrammableHybridFP8TransformerConfig extends Config((site, here, up) => { // sets BuildRoCC to a single Gemmini built from GemminiFPConfigs.NonProgrammableHybridFP8TransformerConfig
+  case BuildRoCC => Seq(
+    (p: Parameters) => {
+      implicit val q = p
+      implicit val v = implicitly[ValName] // NOTE(review): the ValName is presumably derived from this val's name - confirm before renaming
+      LazyModule(new Gemmini(GemminiFPConfigs.NonProgrammableHybridFP8TransformerConfig))
+    }
+  )
+})
diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala
index ae529a69..4ff6e47e 100644
--- a/src/main/scala/gemmini/CustomConfigs.scala
+++ b/src/main/scala/gemmini/CustomConfigs.scala
@@ -1,3 +1,4 @@
+
 package gemmini
 
 import chipsalliance.rocketchip.config.{Config, Parameters}
@@ -42,15 +43,22 @@ object GemminiCustomConfigs {
   )
 
   val ibertInferenceConfig = defaultConfig.copy(
-    has_training_convs = false,
-    has_max_pool =  false,
+    // TODO change these back: the commented-out lines keep the earlier, smaller override set; the active lines below additionally disable optional features and force the WS dataflow
+//    has_training_convs = false,
+//    has_max_pool =  false,
+//    has_normalizations = true,
+//
+//    acc_capacity = CapacityInKilobytes(128),
+
     has_normalizations = true,
+    has_dw_convs = false, has_training_convs = false, has_max_pool = false, has_first_layer_optimizations = false, has_nonlinear_activations = false,
+    dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true,
 
     acc_capacity = CapacityInKilobytes(128),
   )
 
   // Specify which of your custom configs you want to build here
-  val customConfig = baselineInferenceConfig
+  val customConfig = ibertInferenceConfig
 }
 
 
diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala
index 257721ca..7017ff8a 100644
--- a/src/main/scala/gemmini/DSEConfigs.scala
+++ b/src/main/scala/gemmini/DSEConfigs.scala
@@ -25,8 +25,7 @@ object DSEBaseConfig {
     reservation_station_entries_st = 4,
     reservation_station_entries_ex = 16,
 
-
-  sp_banks = 4, // TODO support one-bank designs
+    sp_banks = 4, // TODO support one-bank designs
     acc_banks = 1,
     acc_singleported = false,
     acc_latency = 2,
@@ -39,9 +38,15 @@ object DSEBaseConfig {
     dma_maxbytes = 128, // TODO get this from cacheblockbytes
     dma_buswidth = 128, // TODO get this from SystemBusKey
     aligned_to = 16,
+
     inputType = SInt(8.W),
-    spatialArrayOutputType = SInt(19.W),
+    weightType = SInt(8.W),
     accType = SInt(32.W),
+
+    spatialArrayInputType = SInt(8.W),
+    spatialArrayWeightType = SInt(8.W),
+    spatialArrayOutputType = SInt(19.W),
+
     mvin_scale_args = None,
     mvin_scale_acc_args = None,
     mvin_scale_shared = false,
diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala
index 65add720..bd36103d 100644
--- a/src/main/scala/gemmini/ExecuteController.scala
+++ b/src/main/scala/gemmini/ExecuteController.scala
@@ -118,6 +118,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val activation = if (has_nonlinear_activations) Reg(UInt(Activation.bitwidth.W)) else Activation.NONE // TODO magic number
   val a_transpose = Reg(Bool())
   val bd_transpose = Reg(Bool())
+  val datatypes = if (programmable_datatypes) Reg(Vec(2, Bool())) else VecInit(false.B, true.B) // per-operand format select: index 0 is used for A, index 1 for B and D; true = interpret as inputType, false = interpret as weightType
   val config_initialized = RegInit(false.B)
 
   val a_should_be_fed_into_transposer = Mux(current_dataflow === Dataflow.OS.id.U, !a_transpose, a_transpose)
@@ -183,7 +184,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val cntl = mesh_cntl_signals_q.io.deq.bits
 
   // Instantiate the actual mesh
-  val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, tree_reduction, tile_latency, mesh_output_delay,
+  val mesh = Module(new MeshWithDelays(spatialArrayInputType, spatialArrayWeightType, spatialArrayOutputType, accType, mesh_tag, dataflow, tree_reduction, tile_latency, mesh_output_delay,
     tileRows, tileColumns, meshRows, meshColumns, shifter_banks, shifter_banks))
 
   mesh.io.a.valid := false.B
@@ -556,6 +557,10 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
               a_transpose := config_ex_rs1.a_transpose
               bd_transpose := config_ex_rs1.b_transpose
 
+              if (programmable_datatypes) {
+                datatypes := config_ex_rs1.datatypes
+              }
+
               if (dataflow == Dataflow.BOTH) {
                 current_dataflow := config_ex_rs1.dataflow
               }
@@ -833,9 +838,9 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In
   val dataB_unpadded = MuxCase(readData(cntl.b_bank), Seq(cntl.accumulate_zeros -> 0.U, cntl.b_read_from_acc -> accReadData(cntl.b_bank_acc)))
   val dataD_unpadded = MuxCase(readData(cntl.d_bank), Seq(cntl.preload_zeros -> 0.U, cntl.d_read_from_acc -> accReadData(cntl.d_bank_acc)))
 
-  val dataA = VecInit(dataA_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.a_unpadded_cols, d, inputType.zero)})
-  val dataB = VecInit(dataB_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.b_unpadded_cols, d, inputType.zero)})
-  val dataD = VecInit(dataD_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.d_unpadded_cols, d, inputType.zero)})
+  val dataA = VecInit(dataA_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.a_unpadded_cols, d, inputType.zero)}.map(d => Mux(datatypes(0), d.asTypeOf(inputType).withWidthOf(spatialArrayInputType), d.asTypeOf(weightType).withWidthOf(spatialArrayInputType)))) // convert each interpretation BEFORE the Mux: a Mux result takes the Chisel type of its first operand, so converting afterwards would always decode with inputType's format
+  val dataB = VecInit(dataB_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.b_unpadded_cols, d, inputType.zero)}.map(d => Mux(datatypes(1), d.asTypeOf(inputType).withWidthOf(spatialArrayWeightType), d.asTypeOf(weightType).withWidthOf(spatialArrayWeightType)))) // same per-interpretation conversion; B is selected by datatypes(1)
+  val dataD = VecInit(dataD_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.d_unpadded_cols, d, inputType.zero)}.map(d => Mux(datatypes(1), d.asTypeOf(inputType).withWidthOf(spatialArrayWeightType), d.asTypeOf(weightType).withWidthOf(spatialArrayWeightType)))) // D shares B's select bit and target type
 
   // Pop responses off the scratchpad io ports
   when (mesh_cntl_signals_q.io.deq.fire) {
diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala
index 573581ec..53cc42e6 100644
--- a/src/main/scala/gemmini/GemminiConfigs.scala
+++ b/src/main/scala/gemmini/GemminiConfigs.scala
@@ -18,9 +18,13 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              opcodes: OpcodeSet = OpcodeSet.custom3,
 
                                                                              inputType: T,
-                                                                             spatialArrayOutputType: T,
+                                                                             weightType: T,
                                                                              accType: T,
 
+                                                                             spatialArrayInputType: T,
+                                                                             spatialArrayWeightType: T,
+                                                                             spatialArrayOutputType: T,
+
                                                                              dataflow: Dataflow.Value = Dataflow.BOTH,
 
                                                                              tileRows: Int = 1,
@@ -89,6 +93,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
                                                                              has_normalizations: Boolean = false,
                                                                              has_first_layer_optimizations: Boolean = true,
 
+                                                                             programmable_datatypes: Boolean = false,
+
                                                                              use_firesim_simulation_counters: Boolean = false,
 
                                                                              use_shared_ext_mem: Boolean = false,
@@ -96,6 +102,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
 
                                                                              headerFileName: String = "gemmini_params.h"
                                                        ) {
+  require(inputType.getWidth == weightType.getWidth, s"inputType (${inputType.getWidth} bits) and weightType (${weightType.getWidth} bits) must have the same width") // sp_width below is derived from inputType alone, so both operand formats must occupy the same number of bits
+
   val sp_width = meshColumns * tileColumns * inputType.getWidth
   val sp_bank_entries = sp_capacity match {
     case CapacityInKilobytes(kb) => kb * 1024 * 8 / (sp_banks * sp_width)
diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala
index 7bca089b..70090f5d 100644
--- a/src/main/scala/gemmini/GemminiISA.scala
+++ b/src/main/scala/gemmini/GemminiISA.scala
@@ -178,7 +178,7 @@ object GemminiISA {
   val CONFIG_EX_RS1_CMD_TYPE_WIDTH = 2
   val CONFIG_EX_RS1_DATAFLOW_WIDTH = 1
   val CONFIG_EX_RS1_ACTIVATION_WIDTH = 2
-  val CONFIG_EX_RS1_SPACER0_WIDTH = (7 - 2 - 1 - 2)
+  val CONFIG_EX_RS1_DATATYPE_WIDTH = 2
   val CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH = 1
   val CONFIG_EX_RS1_A_TRANSPOSE_WIDTH = 1
   val CONFIG_EX_RS1_B_TRANSPOSE_WIDTH = 1
@@ -194,7 +194,7 @@ object GemminiISA {
     val b_transpose = UInt(CONFIG_EX_RS1_B_TRANSPOSE_WIDTH.W)
     val a_transpose = UInt(CONFIG_EX_RS1_A_TRANSPOSE_WIDTH.W)
     val set_only_strides = UInt(CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH.W)
-    val _spacer0 = UInt(CONFIG_EX_RS1_SPACER0_WIDTH.W)
+    val datatypes = Vec(CONFIG_EX_RS1_DATATYPE_WIDTH, Bool())
     val activation = UInt(CONFIG_EX_RS1_ACTIVATION_WIDTH.W)
     val dataflow = UInt(CONFIG_EX_RS1_DATAFLOW_WIDTH.W)
     val cmd_type = UInt(CONFIG_EX_RS1_CMD_TYPE_WIDTH.W)
diff --git a/src/main/scala/gemmini/Mesh.scala b/src/main/scala/gemmini/Mesh.scala
index cd056658..03d2ab96 100644
--- a/src/main/scala/gemmini/Mesh.scala
+++ b/src/main/scala/gemmini/Mesh.scala
@@ -14,15 +14,15 @@ import chisel3.experimental._
   * @param meshRows
   * @param meshColumns
   */
-class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T,
+class Mesh[T <: Data : Arithmetic](inputType: T, weightType: T, outputType: T, accType: T,
                                    df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int,
                                    max_simultaneous_matmuls: Int, output_delay: Int,
                                    val tileRows: Int, val tileColumns: Int,
                                    val meshRows: Int, val meshColumns: Int) extends Module {
   val io = IO(new Bundle {
     val in_a = Input(Vec(meshRows, Vec(tileRows, inputType)))
-    val in_b = Input(Vec(meshColumns, Vec(tileColumns, inputType)))
-    val in_d = Input(Vec(meshColumns, Vec(tileColumns, inputType)))
+    val in_b = Input(Vec(meshColumns, Vec(tileColumns, weightType)))
+    val in_d = Input(Vec(meshColumns, Vec(tileColumns, weightType))) // TODO should this be weightType, inputType, or something like max(inputType, weightType)?
     val in_control = Input(Vec(meshColumns, Vec(tileColumns, new PEControl(accType))))
     val in_id = Input(Vec(meshColumns, Vec(tileColumns, UInt(log2Up(max_simultaneous_matmuls).W)))) // The unique id of this particular matmul
     val in_last = Input(Vec(meshColumns, Vec(tileColumns, Bool())))
@@ -36,7 +36,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T,
   })
 
   // mesh(r)(c) => Tile at row r, column c
-  val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, outputType, accType, df, tree_reduction, max_simultaneous_matmuls, tileRows, tileColumns)))
+  val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, weightType, outputType, accType, df, tree_reduction, max_simultaneous_matmuls, tileRows, tileColumns)))
   val meshT = mesh.transpose
 
   def pipe[T <: Data](valid: Bool, t: T, latency: Int): T = {
diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala
index 516760bf..b1d52348 100644
--- a/src/main/scala/gemmini/MeshWithDelays.scala
+++ b/src/main/scala/gemmini/MeshWithDelays.scala
@@ -30,16 +30,16 @@ class MeshWithDelaysResp[T <: Data: Arithmetic, TagT <: TagQueueTag with Data](o
 // TODO make all inputs go straight into registers to help with physical design
 
 class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
-  (inputType: T, val outputType: T, accType: T,
+  (val inputType: T, val weightType: T, val outputType: T, accType: T,
    tagType: U, df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int, output_delay: Int,
    tileRows: Int, tileColumns: Int, meshRows: Int, meshColumns: Int,
    leftBanks: Int, upBanks: Int, outBanks: Int = 1, n_simultaneous_matmuls: Int = -1)
   extends Module {
 
   val A_TYPE = Vec(meshRows, Vec(tileRows, inputType))
-  val B_TYPE = Vec(meshColumns, Vec(tileColumns, inputType))
+  val B_TYPE = Vec(meshColumns, Vec(tileColumns, weightType)) // TODO: decide whether B_TYPE should use weightType, inputType, or a type able to hold either (something like max(inputType, weightType))
   val C_TYPE = Vec(meshColumns, Vec(tileColumns, outputType))
-  val D_TYPE = Vec(meshColumns, Vec(tileColumns, inputType))
+  val D_TYPE = Vec(meshColumns, Vec(tileColumns, weightType)) // TODO: decide whether D_TYPE should use weightType, inputType, or a type able to hold either (something like max(inputType, weightType))
   val S_TYPE = Vec(meshColumns, Vec(tileColumns, new PEControl(accType)))
 
   assert(meshRows*tileRows == meshColumns*tileColumns)
@@ -67,7 +67,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
     val tags_in_progress = Output(Vec(tagqlen, tagType))
   })
 
-  def shifted[T <: Data](x: Vec[Vec[T]], banks: Int, reverse: Boolean = false) = {
+  def shifted[T <: Data](x: Vec[Vec[T]], banks: Int, reverse: Boolean = false): Seq[Vec[T]] = {
     assert(x.size % banks == 0, "cannot bank without clean divisors")
 
     val banked_len = x.size / banks
@@ -164,7 +164,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data]
   val transposer_out = VecInit(transposer.io.outCol.bits.grouped(tileRows).map(t => VecInit(t)).toSeq)
 
   // Wire up mesh's IO to this module's IO
-  val mesh = Module(new Mesh(inputType, outputType, accType, df, tree_reduction, tile_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns))
+  val mesh = Module(new Mesh(inputType, weightType, outputType, accType, df, tree_reduction, tile_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns))
 
   // TODO wire only to *_buf here, instead of io.*.bits
   val a_shifter_in = WireInit(Mux(a_is_from_transposer, transposer_out.asTypeOf(A_TYPE), a_buf))
diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala
index 67dd18ac..a85782b3 100644
--- a/src/main/scala/gemmini/Normalizer.scala
+++ b/src/main/scala/gemmini/Normalizer.scala
@@ -397,7 +397,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
   val exp_divider_out = Wire(Decoupled(scale_t.cloneType))
 
   scale_t match {
-    case Float(expWidth, sigWidth) =>
+    case Float(expWidth, sigWidth, false) =>
 
       exp_divider_in.bits := DontCare
 
diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala
index 9518942f..bb025735 100644
--- a/src/main/scala/gemmini/PE.scala
+++ b/src/main/scala/gemmini/PE.scala
@@ -11,11 +11,11 @@ class PEControl[T <: Data : Arithmetic](accType: T) extends Bundle {
 
 }
 
-class MacUnit[T <: Data](inputType: T, cType: T, dType: T) (implicit ev: Arithmetic[T]) extends Module {
+class MacUnit[T <: Data](inputType: T, weightType: T, cType: T, dType: T) (implicit ev: Arithmetic[T]) extends Module {
   import ev._
   val io = IO(new Bundle {
     val in_a  = Input(inputType)
-    val in_b  = Input(inputType)
+    val in_b  = Input(weightType)
     val in_c  = Input(cType)
     val out_d = Output(dType)
   })
@@ -28,8 +28,8 @@ class MacUnit[T <: Data](inputType: T, cType: T, dType: T) (implicit ev: Arithme
   * A PE implementing a MAC operation. Configured as fully combinational when integrated into a Mesh.
   * @param width Data width of operands
   */
-class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, max_simultaneous_matmuls: Int)
-                   (implicit ev: Arithmetic[T]) extends Module { // Debugging variables
+class PE[T <: Data](inputType: T, weightType: T, outputType: T, accType: T, df: Dataflow.Value,
+                    max_simultaneous_matmuls: Int)(implicit ev: Arithmetic[T]) extends Module {
   import ev._
 
   val io = IO(new Bundle {
@@ -61,7 +61,7 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
   // elaboration/synthesis tools often fail to consolidate and de-duplicate
   // MAC units. To force mac circuitry to be re-used, we create a "mac_unit"
   // module here which just performs a single MAC operation
-  val mac_unit = Module(new MacUnit(inputType, cType, outputType))
+  val mac_unit = Module(new MacUnit(inputType, weightType, cType, outputType))
 
   val a  = io.in_a
   val b  = io.in_b
@@ -102,14 +102,14 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
     when(prop === PROPAGATE) {
       io.out_c := (c1 >> shift_offset).clippedToWidthOf(outputType)
       io.out_b := b
-      mac_unit.io.in_b := b.asTypeOf(inputType)
+      mac_unit.io.in_b := b.asTypeOf(weightType)
       mac_unit.io.in_c := c2
       c2 := mac_unit.io.out_d
       c1 := d.withWidthOf(cType)
     }.otherwise {
       io.out_c := (c2 >> shift_offset).clippedToWidthOf(outputType)
       io.out_b := b
-      mac_unit.io.in_b := b.asTypeOf(inputType)
+      mac_unit.io.in_b := b.asTypeOf(weightType)
       mac_unit.io.in_c := c1
       c1 := mac_unit.io.out_d
       c2 := d.withWidthOf(cType)
@@ -117,13 +117,13 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
   }.elsewhen ((df == Dataflow.WS).B || ((df == Dataflow.BOTH).B && dataflow === WEIGHT_STATIONARY)) {
     when(prop === PROPAGATE) {
       io.out_c := c1
-      mac_unit.io.in_b := c2.asTypeOf(inputType)
+      mac_unit.io.in_b := c2.asTypeOf(weightType)
       mac_unit.io.in_c := b
       io.out_b := mac_unit.io.out_d
       c1 := d
     }.otherwise {
       io.out_c := c2
-      mac_unit.io.in_b := c1.asTypeOf(inputType)
+      mac_unit.io.in_b := c1.asTypeOf(weightType)
       mac_unit.io.in_c := b
       io.out_b := mac_unit.io.out_d
       c2 := d
@@ -133,7 +133,7 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
     //assert(false.B, "unknown dataflow")
     io.out_c := DontCare
     io.out_b := DontCare
-    mac_unit.io.in_b := b.asTypeOf(inputType)
+    mac_unit.io.in_b := b.asTypeOf(weightType)
     mac_unit.io.in_c := c2
   }
 
diff --git a/src/main/scala/gemmini/Tile.scala b/src/main/scala/gemmini/Tile.scala
index 9c2a418c..ead25c74 100644
--- a/src/main/scala/gemmini/Tile.scala
+++ b/src/main/scala/gemmini/Tile.scala
@@ -13,7 +13,8 @@ import Util._
   * @param rows Number of PEs on each row
   * @param columns Number of PEs on each column
   */
-class Tile[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, tree_reduction: Boolean, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int)(implicit ev: Arithmetic[T]) extends Module {
+class Tile[T <: Data](inputType: T, weightType: T, outputType: T, accType: T, df: Dataflow.Value,
+                      tree_reduction: Boolean, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int)(implicit ev: Arithmetic[T]) extends Module {
   val io = IO(new Bundle {
     val in_a        = Input(Vec(rows, inputType))
     val in_b        = Input(Vec(columns, outputType)) // This is the output of the tile next to it
@@ -39,7 +40,7 @@ class Tile[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Valu
 
   import ev._
 
-  val tile = Seq.fill(rows, columns)(Module(new PE(inputType, outputType, accType, df, max_simultaneous_matmuls)))
+  val tile = Seq.fill(rows, columns)(Module(new PE(inputType, weightType, outputType, accType, df, max_simultaneous_matmuls)))
   val tileT = tile.transpose
 
   // TODO: abstract hori/vert broadcast, all these connections look the same