diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 747b6b18..cd133513 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -bcbe3b7f1f40d1c388aca68df498fd7dd4d16e89 +f86707bc95d7e95828e63d70ff28fbdaa76a884e diff --git a/README.md b/README.md index 4caf35bb..3e41d6a5 100644 --- a/README.md +++ b/README.md @@ -24,41 +24,26 @@ Dependencies Before beginning, install the [Chipyard dependencies](https://chipyard.readthedocs.io/en/latest/Chipyard-Basics/Initial-Repo-Setup.html#default-requirements-installation). -Installing Chipyard and Spike +Installing Gemmini ----------------------------- -Run these steps to install Chipyard and Spike (make sure to checkout the correct Chipyard and Spike commits as shown below): +Run these steps: ```shell git clone https://github.com/ucb-bar/chipyard.git cd chipyard -git checkout 1.8.1 +git checkout gemmini-fp8-exploration-2 ./build-setup.sh riscv-tools source env.sh cd generators/gemmini git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" -git checkout dev && git pull origin dev +git checkout fp8-recoding && git pull origin fp8-recoding git submodule update --init --recursive make -C software/libgemmini install -# The final step is only necessary if you want to run MIDAS simulations with -# realistic DRAM models -cd - -cd sims/firesim -source sourceme-f1-manager.sh --skip-ssh-setup # Ignore error messages from this command -./build-setup.sh --library --skip-validate -``` - -Setting Up Gemmini ------------------- - -Run the steps below to set up Gemmini configuration files, symlinks, and subdirectories: - -```shell -cd chipyard/generators/gemmini ./scripts/setup-paths.sh ``` diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index bc3cabb4..f9fd1c05 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit bc3cabb4081e3dbaa1d33119f0471357f2aa3927 +Subproject commit f9fd1c05640d561bc47a192a6aaccd503b5b0705 diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala index c6792578..c8f75387 100644 --- a/src/main/scala/gemmini/Arithmetic.scala +++ b/src/main/scala/gemmini/Arithmetic.scala @@ -8,8 +8,8 @@ import chisel3.util._ import hardfloat._ // Bundles that represent the raw bits of custom datatypes -case class Float(expWidth: Int, sigWidth: Int) extends Bundle { - val bits = UInt((expWidth + sigWidth).W) +case class Float(expWidth: Int, sigWidth: Int, isRecoded: Boolean = false) extends Bundle { + val bits = UInt((expWidth + sigWidth + (if (isRecoded) 1 else 0)).W) val bias: Int = (1 << (expWidth-1)) - 1 } @@ -245,7 +245,7 @@ object Arithmetic { } override def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = u match { - case Float(expWidth, sigWidth) => + case Float(expWidth, sigWidth, false) => val input = Wire(Decoupled(UInt(0.W))) val output = Wire(Decoupled(u.cloneType)) @@ -287,7 +287,7 @@ object Arithmetic { } override def mult_with_reciprocal[U <: Data](reciprocal: U): SInt = reciprocal match { - case recip @ Float(expWidth, sigWidth) => + case recip @ Float(expWidth, sigWidth, false) => def in_to_float(x: SInt) = { val in_to_rec_fn = Module(new INToRecFN(intWidth = self.getWidth, expWidth, sigWidth)) in_to_rec_fn.io.signedIn := true.B @@ -328,12 +328,10 @@ object Arithmetic { } implicit object FloatArithmetic extends Arithmetic[Float] { - // TODO Floating point arithmetic currently switches between recoded and standard formats for every operation. However, it should stay in the recoded format as it travels through the systolic array - override implicit def cast(self: Float): ArithmeticOps[Float] = new ArithmeticOps(self) { override def *(t: Float): Float = { - val t_rec = recFNFromFN(t.expWidth, t.sigWidth, t.bits) - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) + val t_rec = if (t.isRecoded) t.bits else recFNFromFN(t.expWidth, t.sigWidth, t.bits) + val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) val t_resizer = Module(new RecFNToRecFN(t.expWidth, t.sigWidth, self.expWidth, self.sigWidth)) t_resizer.io.in := t_rec @@ -351,16 +349,16 @@ object Arithmetic { muladder.io.b := t_rec_resized muladder.io.c := 0.U - val out = Wire(Float(self.expWidth, self.sigWidth)) - out.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out) + val out = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded)) + out.bits := (if (out.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)) out } override def mac(m1: Float, m2: Float): Float = { // Recode all operands - val m1_rec = recFNFromFN(m1.expWidth, m1.sigWidth, m1.bits) - val m2_rec = recFNFromFN(m2.expWidth, m2.sigWidth, m2.bits) - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) + val m1_rec = if (m1.isRecoded) m1.bits else recFNFromFN(m1.expWidth, m1.sigWidth, m1.bits) + val m2_rec = if (m2.isRecoded) m2.bits else recFNFromFN(m2.expWidth, m2.sigWidth, m2.bits) + val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) // Resize m1 to self's width val m1_resizer = Module(new RecFNToRecFN(m1.expWidth, m1.sigWidth, self.expWidth, self.sigWidth)) @@ -388,8 +386,8 @@ object Arithmetic { muladder.io.c := self_rec // Convert result to standard format // TODO remove these intermediate recodings - val out = Wire(Float(self.expWidth, self.sigWidth)) - out.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out) + val out = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded)) + out.bits := (if (out.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)) out } @@ -397,8 +395,8 @@ object Arithmetic { require(self.getWidth >= t.getWidth) // This just makes it easier to write the resizing code // Recode all operands - val t_rec = recFNFromFN(t.expWidth, t.sigWidth, t.bits) - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) + val t_rec = if (t.isRecoded) t.bits else recFNFromFN(t.expWidth, t.sigWidth, t.bits) + val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) // Generate 1 as a float val in_to_rec_fn = Module(new INToRecFN(1, self.expWidth, self.sigWidth)) @@ -427,8 +425,8 @@ object Arithmetic { muladder.io.b := one_rec muladder.io.c := self_rec - val result = Wire(Float(self.expWidth, self.sigWidth)) - result.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out) + val result = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded)) + result.bits := (if (result.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)) result } @@ -440,7 +438,7 @@ object Arithmetic { override def >>(u: UInt): Float = { // Recode self - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) + val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) // Get 2^(-u) as a recoded float val shift_exp = Wire(UInt(self.expWidth.W)) @@ -461,15 +459,15 @@ object Arithmetic { muladder.io.b := shift_rec muladder.io.c := 0.U - val result = Wire(Float(self.expWidth, self.sigWidth)) - result.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out) + val result = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded)) + result.bits := (if (result.isRecoded) muladder.io.out else fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out)) result } override def >(t: Float): Bool = { // Recode all operands - val t_rec = recFNFromFN(t.expWidth, t.sigWidth, t.bits) - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) + val t_rec = if (t.isRecoded) t.bits else recFNFromFN(t.expWidth, t.sigWidth, t.bits) + val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) // Resize t to self's width val t_resizer = Module(new RecFNToRecFN(t.expWidth, t.sigWidth, self.expWidth, self.sigWidth)) @@ -487,43 +485,49 @@ object Arithmetic { } override def withWidthOf(t: Float): Float = { - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) + val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) val resizer = Module(new RecFNToRecFN(self.expWidth, self.sigWidth, t.expWidth, t.sigWidth)) resizer.io.in := self_rec resizer.io.roundingMode := consts.round_near_even // consts.round_near_maxMag resizer.io.detectTininess := consts.tininess_afterRounding - val result = Wire(Float(t.expWidth, t.sigWidth)) - result.bits := fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out) + val result = Wire(Float(t.expWidth, t.sigWidth, t.isRecoded)) + result.bits := (if (result.isRecoded) resizer.io.out else fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out)) result } override def clippedToWidthOf(t: Float): Float = { // TODO check for overflow. Right now, we just assume that overflow doesn't happen - val self_rec = recFNFromFN(self.expWidth, self.sigWidth, self.bits) + val self_rec = if (self.isRecoded) self.bits else recFNFromFN(self.expWidth, self.sigWidth, self.bits) val resizer = Module(new RecFNToRecFN(self.expWidth, self.sigWidth, t.expWidth, t.sigWidth)) resizer.io.in := self_rec resizer.io.roundingMode := consts.round_near_even // consts.round_near_maxMag resizer.io.detectTininess := consts.tininess_afterRounding - val result = Wire(Float(t.expWidth, t.sigWidth)) - result.bits := fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out) + val result = Wire(Float(t.expWidth, t.sigWidth, t.isRecoded)) + result.bits := (if (result.isRecoded) resizer.io.out else fNFromRecFN(t.expWidth, t.sigWidth, resizer.io.out)) result } override def relu: Float = { - val raw = rawFloatFromFN(self.expWidth, self.sigWidth, self.bits) + val raw = if (self.isRecoded) rawFloatFromRecFN(self.expWidth, self.sigWidth, self.bits) else rawFloatFromFN(self.expWidth, self.sigWidth, self.bits) - val result = Wire(Float(self.expWidth, self.sigWidth)) + val result = Wire(Float(self.expWidth, self.sigWidth, self.isRecoded)) result.bits := Mux(!raw.isZero && raw.sign, 0.U, self.bits) result } override def zero: Float = 0.U.asTypeOf(self) - override def identity: Float = Cat(0.U(2.W), ~(0.U((self.expWidth-1).W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) - override def minimum: Float = Cat(1.U, ~(0.U(self.expWidth.W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) + override def identity: Float = { + require(!self.isRecoded) + Cat(0.U(2.W), ~(0.U((self.expWidth-1).W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) + } + override def minimum: Float = { + require(!self.isRecoded) + Cat(1.U, ~(0.U(self.expWidth.W)), 0.U((self.sigWidth-1).W)).asTypeOf(self) + } } } diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index bd84b317..92155fb7 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -10,7 +10,6 @@ import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ import freechips.rocketchip.system._ import freechips.rocketchip.diplomacy._ - import gemmini.Arithmetic.SIntArithmetic import hardfloat._ @@ -22,8 +21,11 @@ object GemminiConfigs { val defaultConfig = GemminiArrayConfig[SInt, Float, Float]( // Datatypes inputType = SInt(8.W), + weightType = SInt(8.W), accType = SInt(32.W), + spatialArrayInputType = SInt(8.W), + spatialArrayWeightType = SInt(8.W), spatialArrayOutputType = SInt(20.W), // Spatial array size options @@ -166,7 +168,10 @@ object GemminiConfigs { val dummyConfig = GemminiArrayConfig[DummySInt, Float, Float]( inputType = DummySInt(8), + weightType = DummySInt(8), accType = DummySInt(32), + spatialArrayInputType = DummySInt(8), + spatialArrayWeightType = DummySInt(8), spatialArrayOutputType = DummySInt(20), tileRows = defaultConfig.tileRows, tileColumns = defaultConfig.tileColumns, diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index c76907dd..ef5ce85d 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -1,3 +1,4 @@ + package gemmini import chisel3._ @@ -48,10 +49,15 @@ object GemminiFPConfigs { max_in_flight_mem_reqs = 16, use_dedicated_tl_port = false, use_shared_ext_mem = false, + inputType = Float(8, 24), - spatialArrayOutputType = Float(8, 24), + weightType = Float(8, 24), accType = Float(8, 24), + spatialArrayInputType = Float(8, 24), + spatialArrayWeightType = Float(8, 24), + spatialArrayOutputType = Float(8, 24), + mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_shared = false, @@ -81,34 +87,67 @@ object GemminiFPConfigs { ) //FP32 Single Precision Configuration - val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), spatialArrayOutputType = Float(8, 24), accType = Float(8, 24), + val FP32DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 24), weightType = Float(8, 24), accType = Float(8, 24), + spatialArrayInputType = Float(8, 24), spatialArrayWeightType = Float(8, 24), spatialArrayOutputType = Float(8, 24), tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) //FP16 Half Precision Configuration - val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), spatialArrayOutputType = Float(5, 11), accType = Float(8, 24), + val FP16DefaultConfig = defaultFPConfig.copy(inputType = Float(5, 11), weightType = Float(5, 11), accType = Float(8, 24), + spatialArrayInputType = Float(5, 11), spatialArrayWeightType = Float(5, 11), spatialArrayOutputType = Float(5, 11), tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(5, 11), -1, identity = "1.0", c_str="((x) * (scale))")), ) //Bfloat16 Brain-half Precision Configuration - val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24), + val BF16DefaultConfig = defaultFPConfig.copy(inputType = Float(8, 8), weightType = Float(8, 8), accType = Float(8, 24), + spatialArrayInputType = Float(8, 8), spatialArrayWeightType = Float(8, 8), spatialArrayOutputType = Float(8, 8), tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) //Bfloat16 Brain-half Precision Configuration 8x8 array - val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), spatialArrayOutputType = Float(8, 8), accType = Float(8, 24), + val BF16Default8Config = defaultFPConfig.copy(inputType = Float(8, 8), weightType = Float(8, 8), accType = Float(8, 24), + spatialArrayInputType = Float(8, 8), spatialArrayWeightType = Float(8, 8), spatialArrayOutputType = Float(8, 8), meshRows = 8, meshColumns = 8, tile_latency = 2, mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) + val FP8TransformerConfig = defaultFPConfig.copy( + inputType = Float(4, 4), weightType = Float(4, 4), accType = Float(8, 24), + + spatialArrayInputType = Float(4, 4, isRecoded = true), spatialArrayWeightType = Float(4, 4, isRecoded = true), spatialArrayOutputType = Float(5, 11, isRecoded = true), + + has_normalizations = false, // TODO this should be true + has_dw_convs = false, has_training_convs = false, has_max_pool = false, has_first_layer_optimizations = false, has_nonlinear_activations = false, + + dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true, + + meshRows = 16, meshColumns = 16, + acc_capacity = CapacityInKilobytes(128), + + tile_latency = 2, + ) + + val FP8TransformerWithoutDenormalRecodingsConfig = FP8TransformerConfig.copy( + spatialArrayInputType = Float(4, 4, isRecoded = false), spatialArrayWeightType = Float(4, 4, isRecoded = false), spatialArrayOutputType = Float(5, 11, isRecoded = false), + ) + + val HybridFP8TransformerConfig = FP8TransformerConfig.copy( + weightType = Float(5, 3), + spatialArrayInputType = Float(5, 4, isRecoded = true), spatialArrayWeightType = Float(5, 4, isRecoded = true), + programmable_datatypes = true + ) + + val NonProgrammableHybridFP8TransformerConfig = HybridFP8TransformerConfig.copy( + programmable_datatypes = false + ) } @@ -171,3 +210,43 @@ class GemminiBF16Default8Config extends Config((site, here, up) => { ) }) +// FP8 Transformer configs +class GemminiFP8TransformerConfig extends Config((site, here, up) => { + case BuildRoCC => Seq( + (p: Parameters) => { + implicit val q = p + implicit val v = implicitly[ValName] + LazyModule(new Gemmini(GemminiFPConfigs.FP8TransformerConfig)) + } + ) +}) + +class GemminiFP8TransformerNoRecodingConfig extends Config((site, here, up) => { + case BuildRoCC => Seq( + (p: Parameters) => { + implicit val q = p + implicit val v = implicitly[ValName] + LazyModule(new Gemmini(GemminiFPConfigs.FP8TransformerWithoutDenormalRecodingsConfig)) + } + ) +}) + +class GemminiHybridFP8TransformerConfig extends Config((site, here, up) => { + case BuildRoCC => Seq( + (p: Parameters) => { + implicit val q = p + implicit val v = implicitly[ValName] + LazyModule(new Gemmini(GemminiFPConfigs.HybridFP8TransformerConfig)) + } + ) +}) + +class GemminiNonProgrammableHybridFP8TransformerConfig extends Config((site, here, up) => { + case BuildRoCC => Seq( + (p: Parameters) => { + implicit val q = p + implicit val v = implicitly[ValName] + LazyModule(new Gemmini(GemminiFPConfigs.NonProgrammableHybridFP8TransformerConfig)) + } + ) +}) diff --git a/src/main/scala/gemmini/CustomConfigs.scala b/src/main/scala/gemmini/CustomConfigs.scala index ae529a69..4ff6e47e 100644 --- a/src/main/scala/gemmini/CustomConfigs.scala +++ b/src/main/scala/gemmini/CustomConfigs.scala @@ -1,3 +1,4 @@ + package gemmini import chipsalliance.rocketchip.config.{Config, Parameters} @@ -42,15 +43,22 @@ object GemminiCustomConfigs { ) val ibertInferenceConfig = defaultConfig.copy( - has_training_convs = false, - has_max_pool = false, + // TODO change these back +// has_training_convs = false, +// has_max_pool = false, +// has_normalizations = true, +// +// acc_capacity = CapacityInKilobytes(128), + has_normalizations = true, + has_dw_convs = false, has_training_convs = false, has_max_pool = false, has_first_layer_optimizations = false, has_nonlinear_activations = false, + dataflow=Dataflow.WS, max_in_flight_mem_reqs = 64, acc_read_full_width = false, ex_read_from_acc = false, ex_write_to_spad = false, hardcode_d_to_garbage_addr = true, acc_capacity = CapacityInKilobytes(128), ) // Specify which of your custom configs you want to build here - val customConfig = baselineInferenceConfig + val customConfig = ibertInferenceConfig } diff --git a/src/main/scala/gemmini/DSEConfigs.scala b/src/main/scala/gemmini/DSEConfigs.scala index 257721ca..7017ff8a 100644 --- a/src/main/scala/gemmini/DSEConfigs.scala +++ b/src/main/scala/gemmini/DSEConfigs.scala @@ -25,8 +25,7 @@ object DSEBaseConfig { reservation_station_entries_st = 4, reservation_station_entries_ex = 16, - - sp_banks = 4, // TODO support one-bank designs + sp_banks = 4, // TODO support one-bank designs acc_banks = 1, acc_singleported = false, acc_latency = 2, @@ -39,9 +38,15 @@ object DSEBaseConfig { dma_maxbytes = 128, // TODO get this from cacheblockbytes dma_buswidth = 128, // TODO get this from SystemBusKey aligned_to = 16, + inputType = SInt(8.W), - spatialArrayOutputType = SInt(19.W), + weightType = SInt(8.W), accType = SInt(32.W), + + spatialArrayInputType = SInt(8.W), + spatialArrayWeightType = SInt(8.W), + spatialArrayOutputType = SInt(19.W), + mvin_scale_args = None, mvin_scale_acc_args = None, mvin_scale_shared = false, diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index 65add720..bd36103d 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -118,6 +118,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val activation = if (has_nonlinear_activations) Reg(UInt(Activation.bitwidth.W)) else Activation.NONE // TODO magic number val a_transpose = Reg(Bool()) val bd_transpose = Reg(Bool()) + val datatypes = if (programmable_datatypes) Reg(Vec(2, Bool())) else VecInit(false.B, true.B) val config_initialized = RegInit(false.B) val a_should_be_fed_into_transposer = Mux(current_dataflow === Dataflow.OS.id.U, !a_transpose, a_transpose) @@ -183,7 +184,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val cntl = mesh_cntl_signals_q.io.deq.bits // Instantiate the actual mesh - val mesh = Module(new MeshWithDelays(inputType, spatialArrayOutputType, accType, mesh_tag, dataflow, tree_reduction, tile_latency, mesh_output_delay, + val mesh = Module(new MeshWithDelays(spatialArrayInputType, spatialArrayWeightType, spatialArrayOutputType, accType, mesh_tag, dataflow, tree_reduction, tile_latency, mesh_output_delay, tileRows, tileColumns, meshRows, meshColumns, shifter_banks, shifter_banks)) mesh.io.a.valid := false.B @@ -556,6 +557,10 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In a_transpose := config_ex_rs1.a_transpose bd_transpose := config_ex_rs1.b_transpose + if (programmable_datatypes) { + datatypes := config_ex_rs1.datatypes + } + if (dataflow == Dataflow.BOTH) { current_dataflow := config_ex_rs1.dataflow } @@ -833,9 +838,9 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val dataB_unpadded = MuxCase(readData(cntl.b_bank), Seq(cntl.accumulate_zeros -> 0.U, cntl.b_read_from_acc -> accReadData(cntl.b_bank_acc))) val dataD_unpadded = MuxCase(readData(cntl.d_bank), Seq(cntl.preload_zeros -> 0.U, cntl.d_read_from_acc -> accReadData(cntl.d_bank_acc))) - val dataA = VecInit(dataA_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.a_unpadded_cols, d, inputType.zero)}) - val dataB = VecInit(dataB_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.b_unpadded_cols, d, inputType.zero)}) - val dataD = VecInit(dataD_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.d_unpadded_cols, d, inputType.zero)}) + val dataA = VecInit(dataA_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.a_unpadded_cols, d, inputType.zero)}.map(d => Mux(datatypes(0), d.asTypeOf(inputType), d.asTypeOf(weightType)).withWidthOf(spatialArrayInputType))) + val dataB = VecInit(dataB_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.b_unpadded_cols, d, inputType.zero)}.map(d => Mux(datatypes(1), d.asTypeOf(inputType), d.asTypeOf(weightType)).withWidthOf(spatialArrayWeightType))) + val dataD = VecInit(dataD_unpadded.asTypeOf(Vec(block_size, inputType)).zipWithIndex.map { case (d, i) => Mux(i.U < cntl.d_unpadded_cols, d, inputType.zero)}.map(d => Mux(datatypes(1), d.asTypeOf(inputType), d.asTypeOf(weightType)).withWidthOf(spatialArrayWeightType))) // Pop responses off the scratchpad io ports when (mesh_cntl_signals_q.io.deq.fire) { diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 573581ec..53cc42e6 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -18,9 +18,13 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( opcodes: OpcodeSet = OpcodeSet.custom3, inputType: T, - spatialArrayOutputType: T, + weightType: T, accType: T, + spatialArrayInputType: T, + spatialArrayWeightType: T, + spatialArrayOutputType: T, + dataflow: Dataflow.Value = Dataflow.BOTH, tileRows: Int = 1, @@ -89,6 +93,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( has_normalizations: Boolean = false, has_first_layer_optimizations: Boolean = true, + programmable_datatypes: Boolean = false, + use_firesim_simulation_counters: Boolean = false, use_shared_ext_mem: Boolean = false, @@ -96,6 +102,8 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( headerFileName: String = "gemmini_params.h" ) { + require(inputType.getWidth == weightType.getWidth) + val sp_width = meshColumns * tileColumns * inputType.getWidth val sp_bank_entries = sp_capacity match { case CapacityInKilobytes(kb) => kb * 1024 * 8 / (sp_banks * sp_width) diff --git a/src/main/scala/gemmini/GemminiISA.scala b/src/main/scala/gemmini/GemminiISA.scala index 7bca089b..70090f5d 100644 --- a/src/main/scala/gemmini/GemminiISA.scala +++ b/src/main/scala/gemmini/GemminiISA.scala @@ -178,7 +178,7 @@ object GemminiISA { val CONFIG_EX_RS1_CMD_TYPE_WIDTH = 2 val CONFIG_EX_RS1_DATAFLOW_WIDTH = 1 val CONFIG_EX_RS1_ACTIVATION_WIDTH = 2 - val CONFIG_EX_RS1_SPACER0_WIDTH = (7 - 2 - 1 - 2) + val CONFIG_EX_RS1_DATATYPE_WIDTH = 2 val CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH = 1 val CONFIG_EX_RS1_A_TRANSPOSE_WIDTH = 1 val CONFIG_EX_RS1_B_TRANSPOSE_WIDTH = 1 @@ -194,7 +194,7 @@ object GemminiISA { val b_transpose = UInt(CONFIG_EX_RS1_B_TRANSPOSE_WIDTH.W) val a_transpose = UInt(CONFIG_EX_RS1_A_TRANSPOSE_WIDTH.W) val set_only_strides = UInt(CONFIG_EX_RS1_SET_ONLY_STRIDES_WIDTH.W) - val _spacer0 = UInt(CONFIG_EX_RS1_SPACER0_WIDTH.W) + val datatypes = Vec(CONFIG_EX_RS1_DATATYPE_WIDTH, Bool()) val activation = UInt(CONFIG_EX_RS1_ACTIVATION_WIDTH.W) val dataflow = UInt(CONFIG_EX_RS1_DATAFLOW_WIDTH.W) val cmd_type = UInt(CONFIG_EX_RS1_CMD_TYPE_WIDTH.W) diff --git a/src/main/scala/gemmini/Mesh.scala b/src/main/scala/gemmini/Mesh.scala index cd056658..03d2ab96 100644 --- a/src/main/scala/gemmini/Mesh.scala +++ b/src/main/scala/gemmini/Mesh.scala @@ -14,15 +14,15 @@ import chisel3.experimental._ * @param meshRows * @param meshColumns */ -class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, +class Mesh[T <: Data : Arithmetic](inputType: T, weightType: T, outputType: T, accType: T, df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int, max_simultaneous_matmuls: Int, output_delay: Int, val tileRows: Int, val tileColumns: Int, val meshRows: Int, val meshColumns: Int) extends Module { val io = IO(new Bundle { val in_a = Input(Vec(meshRows, Vec(tileRows, inputType))) - val in_b = Input(Vec(meshColumns, Vec(tileColumns, inputType))) - val in_d = Input(Vec(meshColumns, Vec(tileColumns, inputType))) + val in_b = Input(Vec(meshColumns, Vec(tileColumns, weightType))) + val in_d = Input(Vec(meshColumns, Vec(tileColumns, weightType))) // TODO should this be weightType, inputType, or something like max(inputType, weightType)? val in_control = Input(Vec(meshColumns, Vec(tileColumns, new PEControl(accType)))) val in_id = Input(Vec(meshColumns, Vec(tileColumns, UInt(log2Up(max_simultaneous_matmuls).W)))) // The unique id of this particular matmul val in_last = Input(Vec(meshColumns, Vec(tileColumns, Bool()))) @@ -36,7 +36,7 @@ class Mesh[T <: Data : Arithmetic](inputType: T, outputType: T, accType: T, }) // mesh(r)(c) => Tile at row r, column c - val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, outputType, accType, df, tree_reduction, max_simultaneous_matmuls, tileRows, tileColumns))) + val mesh: Seq[Seq[Tile[T]]] = Seq.fill(meshRows, meshColumns)(Module(new Tile(inputType, weightType, outputType, accType, df, tree_reduction, max_simultaneous_matmuls, tileRows, tileColumns))) val meshT = mesh.transpose def pipe[T <: Data](valid: Bool, t: T, latency: Int): T = { diff --git a/src/main/scala/gemmini/MeshWithDelays.scala b/src/main/scala/gemmini/MeshWithDelays.scala index 516760bf..b1d52348 100644 --- a/src/main/scala/gemmini/MeshWithDelays.scala +++ b/src/main/scala/gemmini/MeshWithDelays.scala @@ -30,16 +30,16 @@ class MeshWithDelaysResp[T <: Data: Arithmetic, TagT <: TagQueueTag with Data](o // TODO make all inputs go straight into registers to help with physical design class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] - (inputType: T, val outputType: T, accType: T, + (val inputType: T, val weightType: T, val outputType: T, accType: T, tagType: U, df: Dataflow.Value, tree_reduction: Boolean, tile_latency: Int, output_delay: Int, tileRows: Int, tileColumns: Int, meshRows: Int, meshColumns: Int, leftBanks: Int, upBanks: Int, outBanks: Int = 1, n_simultaneous_matmuls: Int = -1) extends Module { val A_TYPE = Vec(meshRows, Vec(tileRows, inputType)) - val B_TYPE = Vec(meshColumns, Vec(tileColumns, inputType)) + val B_TYPE = Vec(meshColumns, Vec(tileColumns, weightType)) // TODO should this be weightType, inputType, or something like max(inputType, weightType)? val C_TYPE = Vec(meshColumns, Vec(tileColumns, outputType)) - val D_TYPE = Vec(meshColumns, Vec(tileColumns, inputType)) + val D_TYPE = Vec(meshColumns, Vec(tileColumns, weightType)) // TODO should this be weightType, inputType, or something like max(inputType, weightType)? val S_TYPE = Vec(meshColumns, Vec(tileColumns, new PEControl(accType))) assert(meshRows*tileRows == meshColumns*tileColumns) @@ -67,7 +67,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] val tags_in_progress = Output(Vec(tagqlen, tagType)) }) - def shifted[T <: Data](x: Vec[Vec[T]], banks: Int, reverse: Boolean = false) = { + def shifted[T <: Data](x: Vec[Vec[T]], banks: Int, reverse: Boolean = false): Seq[Vec[T]] = { assert(x.size % banks == 0, "cannot bank without clean divisors") val banked_len = x.size / banks @@ -164,7 +164,7 @@ class MeshWithDelays[T <: Data: Arithmetic, U <: TagQueueTag with Data] val transposer_out = VecInit(transposer.io.outCol.bits.grouped(tileRows).map(t => VecInit(t)).toSeq) // Wire up mesh's IO to this module's IO - val mesh = Module(new Mesh(inputType, outputType, accType, df, tree_reduction, tile_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns)) + val mesh = Module(new Mesh(inputType, weightType, outputType, accType, df, tree_reduction, tile_latency, max_simultaneous_matmuls, output_delay, tileRows, tileColumns, meshRows, meshColumns)) // TODO wire only to *_buf here, instead of io.*.bits val a_shifter_in = WireInit(Mux(a_is_from_transposer, transposer_out.asTypeOf(A_TYPE), a_buf)) diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala index 67dd18ac..a85782b3 100644 --- a/src/main/scala/gemmini/Normalizer.scala +++ b/src/main/scala/gemmini/Normalizer.scala @@ -397,7 +397,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val exp_divider_out = Wire(Decoupled(scale_t.cloneType)) scale_t match { - case Float(expWidth, sigWidth) => + case Float(expWidth, sigWidth, false) => exp_divider_in.bits := DontCare diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala index 9518942f..bb025735 100644 --- a/src/main/scala/gemmini/PE.scala +++ b/src/main/scala/gemmini/PE.scala @@ -11,11 +11,11 @@ class PEControl[T <: Data : Arithmetic](accType: T) extends Bundle { } -class MacUnit[T <: Data](inputType: T, cType: T, dType: T) (implicit ev: Arithmetic[T]) extends Module { +class MacUnit[T <: Data](inputType: T, weightType: T, cType: T, dType: T) (implicit ev: Arithmetic[T]) extends Module { import ev._ val io = IO(new Bundle { val in_a = Input(inputType) - val in_b = Input(inputType) + val in_b = Input(weightType) val in_c = Input(cType) val out_d = Output(dType) }) @@ -28,8 +28,8 @@ class MacUnit[T <: Data](inputType: T, cType: T, dType: T) (implicit ev: Arithme * A PE implementing a MAC operation. Configured as fully combinational when integrated into a Mesh. * @param width Data width of operands */ -class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, max_simultaneous_matmuls: Int) - (implicit ev: Arithmetic[T]) extends Module { // Debugging variables +class PE[T <: Data](inputType: T, weightType: T, outputType: T, accType: T, df: Dataflow.Value, + max_simultaneous_matmuls: Int)(implicit ev: Arithmetic[T]) extends Module { import ev._ val io = IO(new Bundle { @@ -61,7 +61,7 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, // elaboration/synthesis tools often fail to consolidate and de-duplicate // MAC units. To force mac circuitry to be re-used, we create a "mac_unit" // module here which just performs a single MAC operation - val mac_unit = Module(new MacUnit(inputType, cType, outputType)) + val mac_unit = Module(new MacUnit(inputType, weightType, cType, outputType)) val a = io.in_a val b = io.in_b @@ -102,14 +102,14 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, when(prop === PROPAGATE) { io.out_c := (c1 >> shift_offset).clippedToWidthOf(outputType) io.out_b := b - mac_unit.io.in_b := b.asTypeOf(inputType) + mac_unit.io.in_b := b.asTypeOf(weightType) mac_unit.io.in_c := c2 c2 := mac_unit.io.out_d c1 := d.withWidthOf(cType) }.otherwise { io.out_c := (c2 >> shift_offset).clippedToWidthOf(outputType) io.out_b := b - mac_unit.io.in_b := b.asTypeOf(inputType) + mac_unit.io.in_b := b.asTypeOf(weightType) mac_unit.io.in_c := c1 c1 := mac_unit.io.out_d c2 := d.withWidthOf(cType) @@ -117,13 +117,13 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, }.elsewhen ((df == Dataflow.WS).B || ((df == Dataflow.BOTH).B && dataflow === WEIGHT_STATIONARY)) { when(prop === PROPAGATE) { io.out_c := c1 - mac_unit.io.in_b := c2.asTypeOf(inputType) + mac_unit.io.in_b := c2.asTypeOf(weightType) mac_unit.io.in_c := b io.out_b := mac_unit.io.out_d c1 := d }.otherwise { io.out_c := c2 - mac_unit.io.in_b := c1.asTypeOf(inputType) + mac_unit.io.in_b := c1.asTypeOf(weightType) mac_unit.io.in_c := b io.out_b := mac_unit.io.out_d c2 := d @@ -133,7 +133,7 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, //assert(false.B, "unknown dataflow") io.out_c := DontCare io.out_b := DontCare - mac_unit.io.in_b := b.asTypeOf(inputType) + mac_unit.io.in_b := b.asTypeOf(weightType) mac_unit.io.in_c := c2 } diff --git a/src/main/scala/gemmini/Tile.scala b/src/main/scala/gemmini/Tile.scala index 9c2a418c..ead25c74 100644 --- a/src/main/scala/gemmini/Tile.scala +++ b/src/main/scala/gemmini/Tile.scala @@ -13,7 +13,8 @@ import Util._ * @param rows Number of PEs on each row * @param columns Number of PEs on each column */ -class Tile[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, tree_reduction: Boolean, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int)(implicit ev: Arithmetic[T]) extends Module { +class Tile[T <: Data](inputType: T, weightType: T, outputType: T, accType: T, df: Dataflow.Value, + tree_reduction: Boolean, max_simultaneous_matmuls: Int, val rows: Int, val columns: Int)(implicit ev: Arithmetic[T]) extends Module { val io = IO(new Bundle { val in_a = Input(Vec(rows, inputType)) val in_b = Input(Vec(columns, outputType)) // This is the output of the tile next to it @@ -39,7 +40,7 @@ class Tile[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Valu import ev._ - val tile = Seq.fill(rows, columns)(Module(new PE(inputType, outputType, accType, df, max_simultaneous_matmuls))) + val tile = Seq.fill(rows, columns)(Module(new PE(inputType, weightType, outputType, accType, df, max_simultaneous_matmuls))) val tileT = tile.transpose // TODO: abstract hori/vert broadcast, all these connections look the same