diff --git a/Bender.lock b/Bender.lock index 89d566b7e..3b04a2e98 100644 --- a/Bender.lock +++ b/Bender.lock @@ -109,4 +109,4 @@ packages: source: Git: https://github.com/pulp-platform/tech_cells_generic.git dependencies: - - common_verification + - common_verification \ No newline at end of file diff --git a/Bender.yml b/Bender.yml index 180981a8e..b58242816 100644 --- a/Bender.yml +++ b/Bender.yml @@ -73,4 +73,4 @@ sources: - target: fpga files: # Level 1 - - hardware/src/axi_rab_wrap.sv + - hardware/src/axi_rab_wrap.sv \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d510484e..2ff2e1dd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Add support for the `FENCE` instruction - Add support for DRAMsys5.0 co-simulation - Add support for atomics in L2 +- Add Dynamic Address Scrambling (DAS) support with configurable partitioning, hardware address scrambler, and software runtime for dynamic heap allocation ### Changes - Add physical feasible TeraPool configuration with SubGroup hierarchy. diff --git a/README.md b/README.md index 003a58779..70f0d8493 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ [![ci](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml) [![lint](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml/badge.svg)](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) - # MemPool MemPool is a many-core system targeting image processing and wireless applications. It implements 256 RISC-V cores that can access a large, shared L1 memory in at most five cycles. 
TeraPool and MinPool, respectively a 1024 RISC-V cores scaled-up and a 16 RISC-V cores scaled-down parametrizations of MemPool are also supported. diff --git a/config/README.md b/config/README.md index 1aa187773..1e87bb4de 100644 --- a/config/README.md +++ b/config/README.md @@ -34,3 +34,63 @@ you can use the following command to make git pick up tracking the file again: ```bash git update-index --no-assume-unchanged config/config.mk ``` + +## Dynamic Address Scrambling (DAS) + +Dynamic Address Scrambling (DAS) is a runtime-configurable address mapping +technique. DAS remaps contiguous address spaces to physically adjacent memory +banks based on the workload's memory access patterns, placing data physically +close to PEs. + +### Build-time configuration + +DAS is controlled by three variables in `config.mk`: + +| Variable | Default | Description | +|----------------------|---------|--------------------------------------------| +| `das` | `1` | Enable (`1`) or disable (`0`) DAS support | +| `num_das_partitions` | `4` | Number of independent DAS regions | +| `das_mem_size` | `2048` | DAS heap size per core (bytes) | + +### DAS registers + +Each DAS partition `i` (0 .. `num_das_partitions - 1`) is programmed through +three memory-mapped registers: + +| Register | Description | +|----------------|------------------------------------------------------------| +| `tiles_das[i]` | Folding granularity: number of tiles in this DAS partition | +| `start_das[i]` | Allocated start address of this DAS partition | +| `rows_das[i]` | Allocated size of this DAS partition (in rows) | + +The hardware address scrambler uses these registers to remap addresses within +each partition so that consecutive words land on adjacent banks within +`tiles_das[i]` tiles, rather than being interleaved across all tiles. + +### Software usage + +The runtime provides a convenience API to configure DAS partitions. 
A typical +flow (see `software/apps/baremetal/das_gemm_f32/main.c` for a full example): + +```c +// 1. Initialize the DAS heap allocator +mempool_dynamic_heap_alloc_init(core_id); +alloc_t *das_alloc = get_dynamic_heap_alloc(); + +// 2. Allocate buffers from the DAS heap +float *a = (float *)partition_malloc(das_alloc, a_size); +float *b = (float *)partition_malloc(das_alloc, b_size); + +// 3. Configure DAS partitions +// das_config(partition_id, tiles_per_partition, start_addr, size_bytes) +// - Setting tiles_per_partition = 1 maps the region to a single tile (local). +// - Setting tiles_per_partition = NUM_TILES keeps the default full interleaving. +das_config(0, 1, (uint32_t)a, a_size); // a: local to one tile +das_config(1, NUM_TILES, (uint32_t)b, b_size); // b: fully interleaved + +// 4. Use the buffers normally — the hardware handles address remapping + +// 5. Free when done +partition_free(das_alloc, b); +partition_free(das_alloc, a); +``` diff --git a/config/config.mk b/config/config.mk index 9ea9a0fd0..3a4fd33f8 100644 --- a/config/config.mk +++ b/config/config.mk @@ -73,6 +73,12 @@ zquarterinx ?= 0 # DivSqrt deactivated by default xDivSqrt ?= 0 +# Enable configurable addressing scheme in the heap +das ?= 1 +num_das_partitions ?= 4 +# Size of DAS-heap per core +das_mem_size ?= 2048 + # This parameter is only used for TeraPool configurations num_sub_groups_per_group ?= 1 remote_group_latency_cycles ?= 7 diff --git a/config/terapool.mk b/config/terapool.mk index 0f1c264f8..6bdd329e9 100644 --- a/config/terapool.mk +++ b/config/terapool.mk @@ -45,4 +45,4 @@ dmas_per_group ?= 4 # Brust Length = 16 # L2 Banks/Channels l2_banks = 16 -l2_size ?= 16777216 # 1000000 \ No newline at end of file +l2_size ?= 16777216 # 1000000 diff --git a/hardware/Makefile b/hardware/Makefile index 895d917d1..9f1448a24 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -118,6 +118,8 @@ vlog_defs += -DL2_SIZE=32\'d$(l2_size) vlog_defs += -DL2_BANKS=$(l2_banks) vlog_defs += 
-DL1_BANK_SIZE=$(l1_bank_size) vlog_defs += -DBOOT_ADDR=32\'d$(boot_addr) +vlog_defs += -DDAS=$(das) +vlog_defs += -DNUM_DAS_PARTITIONS=$(num_das_partitions) # Snitch ISA vlog_defs += -DXPULPIMG=$(xpulpimg) vlog_defs += -DZFINX=$(zfinx) @@ -436,4 +438,4 @@ clean-dasm: rm -rf $(buildpath)/*.dasm clean-trace: - rm -rf $(buildpath)/*.trace + rm -rf $(buildpath)/*.trace \ No newline at end of file diff --git a/hardware/deps/idma/Bender.yml b/hardware/deps/idma/Bender.yml index 0064ee877..0ad4a786d 100644 --- a/hardware/deps/idma/Bender.yml +++ b/hardware/deps/idma/Bender.yml @@ -15,6 +15,7 @@ sources: # levels 1 and 0, etc. Files within a level are ordered alphabetically. # Level 0 - src/axi_dma_data_path.sv + - src/midends/idma_address_scrambler.sv # Level 1 - src/axi_dma_data_mover.sv - src/axi_dma_burst_reshaper.sv @@ -23,6 +24,7 @@ sources: # Level 3: MemPool - src/midends/idma_split_midend.sv - src/midends/idma_distributed_midend.sv + # If enabled DAS - src/frontends/mempool/mempool_dma_frontend_reg_pkg.sv - src/frontends/mempool/mempool_dma_frontend_reg_top.sv - src/frontends/mempool/mempool_dma.sv diff --git a/hardware/deps/idma/src/midends/idma_address_scrambler.sv b/hardware/deps/idma/src/midends/idma_address_scrambler.sv new file mode 100644 index 000000000..b637a903f --- /dev/null +++ b/hardware/deps/idma/src/midends/idma_address_scrambler.sv @@ -0,0 +1,105 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Description: Address scrambler for iDMA Midend, scramble scheme is determined +// by tiles_das +// Current constraints: + +// Author: Bowen Wang +// Author: Marco Bertuletti + +module idma_address_scrambler #( + parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, + parameter int unsigned ByteOffset = 2, + parameter bit Bypass = 0, + parameter int unsigned NumTiles = 128, + parameter int unsigned NumBanksPerTile = 32, + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned NumDASPartitions = 4, + parameter int unsigned DASStartAddr = 1024, + parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, + parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles +) ( + input logic [AddrWidth-1:0] address_i, + input logic [31:0] num_bytes_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] tiles_das_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] rows_das_i, + input logic [NumDASPartitions-1:0][DataWidth-1:0] start_das_i, + output logic [$clog2(NumTiles):0] tiles_das_o, + output logic [$clog2(NumTiles):0] rows_das_o, + output logic [AddrWidth-1:0] address_o +); + // Basic Settings + localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); + localparam int unsigned TileIdBits = $clog2(NumTiles); + localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; + + if (Bypass || NumTiles < 2) begin + assign address_o = address_i; + end else begin + + // ------ Heap Sequential Signals ------ // + + // `tile_index` : how many bits to shift for TileID bits in each partition + // `row_index`: how many bits need to swap within Row Index + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_index; + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] row_index; + + for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) 
i_log_tile_index ( + .in_i (tiles_das_i[i]), + .cnt_o (tile_index[i] ), + .empty_o (/* Unused */ ) + ); + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log_row_index ( + .in_i (rows_das_i[i][$clog2(NumTiles):0]), + .cnt_o (row_index[i] ), + .empty_o (/* Unused */ ) + ); + end + + always_comb begin + + // Default: Unscrambled + address_o = address_i; + tiles_das_o = '0; + rows_das_o = '0; + + // TODO (bowwang): add a new register to indicate the start addr of sequential heap region, currently hard coded + if (address_i < DASStartAddr) begin + tiles_das_o = NumTiles; // fully interleaved + rows_das_o = num_bytes_i / MemSizePerRow; + + // DAS address scrambling + end else begin + + for (int p = 0; p < NumDASPartitions; p++) begin + if ( (address_i >= start_das_i[p]) && (address_i < start_das_i[p]+MemSizePerRow*rows_das_i[p]) ) begin + address_o = '0; + address_o |= address_i & ((1 << (tile_index[p]+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (row_index[p]+tile_index[p]+ConstantBitsLSB)) << (tile_index[p]+ConstantBitsLSB)) & ((1 << (TileIdBits+ConstantBitsLSB)) - 1); + address_o |= ((address_i >> (tile_index[p]+ConstantBitsLSB)) << (TileIdBits + ConstantBitsLSB)) & ((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1); + address_o |= address_i & ~((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1); + tiles_das_o = tiles_das_i[p]; + rows_das_o = rows_das_i[p]; + end + end + + end + end + + end + + // Check for unsupported configurations + if (NumBanksPerTile < 2) + $fatal(1, "NumBanksPerTile must be at least 2.
The special case '1' is currently not supported!"); + +endmodule : idma_address_scrambler diff --git a/hardware/deps/idma/src/midends/idma_distributed_midend.sv b/hardware/deps/idma/src/midends/idma_distributed_midend.sv index e1cd96e10..adff5b758 100644 --- a/hardware/deps/idma/src/midends/idma_distributed_midend.sv +++ b/hardware/deps/idma/src/midends/idma_distributed_midend.sv @@ -3,6 +3,8 @@ // SPDX-License-Identifier: SHL-0.51 // Samuel Riedel +// Bowen Wang +// Marco Bertuletti `include "common_cells/registers.svh" @@ -17,23 +19,31 @@ module idma_distributed_midend #( parameter int unsigned DmaRegionEnd = 32'h1000_0000, /// Number of generic 1D requests that can be buffered parameter int unsigned TransFifoDepth = 1, +`ifdef DAS + parameter int unsigned NumTiles = 64, + parameter int unsigned NumDASPartitions = 4, +`endif /// Arbitrary 1D burst request definition parameter type burst_req_t = logic, /// Meta data response definition parameter type meta_t = logic ) ( - input logic clk_i, - input logic rst_ni, + input logic clk_i, + input logic rst_ni, +`ifdef DAS + // DAS signals + input logic [$clog2(NumTiles):0] rows_das_i, +`endif // Slave - input burst_req_t burst_req_i, - input logic valid_i, - output logic ready_o, - output meta_t meta_o, + input burst_req_t burst_req_i, + input logic valid_i, + output logic ready_o, + output meta_t meta_o, // Master - output burst_req_t [NoMstPorts-1:0] burst_req_o, - output logic [NoMstPorts-1:0] valid_o, - input logic [NoMstPorts-1:0] ready_i, - input meta_t [NoMstPorts-1:0] meta_i + output burst_req_t [NoMstPorts-1:0] burst_req_o, + output logic [NoMstPorts-1:0] valid_o, + input logic [NoMstPorts-1:0] ready_i, + input meta_t [NoMstPorts-1:0] meta_i ); localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); @@ -57,6 +67,7 @@ module idma_distributed_midend #( // Collect the `trans_complete` signals and reduce them once we have all of them logic empty; logic data; + logic push; fifo_v3 #( .FALL_THROUGH (0 ), .DATA_WIDTH 
(1 ), @@ -70,12 +81,44 @@ module idma_distributed_midend #( .empty_o (empty ), .usage_o (/*unused*/ ), .data_i (1'b1 ), - .push_i (trans_complete_d[i] ), + .push_i (push ), .data_o (data ), .pop_i (meta_o.trans_complete) ); assign trans_complete_d[i] = meta_i[i].trans_complete | tie_off_trans_complete_q[i]; assign trans_complete_q[i] = data && !empty; + +`ifdef DAS + // Handle two completion signals (meta_i trans_complete and the tie-off) arriving at the same time + logic [NumDASPartitions-1:0] conflict_counter_d, conflict_counter_q; + `FF(conflict_counter_q, conflict_counter_d, '0, clk_i, rst_ni) + always_comb begin + push = trans_complete_d[i] && !fifo_full[i]; + conflict_counter_d = conflict_counter_q; + // FIFO is not full + if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && !fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + // FIFO is full + if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+2; + end + if (!meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + if (meta_i[i].trans_complete && !tie_off_trans_complete_q[i] && fifo_full[i]) begin + conflict_counter_d = conflict_counter_q+1; + end + // FIFO is not full, safe to push + if (|conflict_counter_q && !trans_complete_d[i] && !fifo_full[i] ) begin + push = 1'b1; + conflict_counter_d = conflict_counter_q-1; + end + end +`else + assign push = trans_complete_d[i]; +`endif + end always_comb begin @@ -106,6 +149,7 @@ module idma_distributed_midend #( assign dst_addr = burst_req_i.dst[FullRegionAddressBits-1:0]; always_comb begin + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin start_addr = src_addr; end else begin @@ -126,6 +170,23 @@ module idma_distributed_midend #( burst_req_o[i].dst = burst_req_i.dst; // Modify lower addresses bits and size if (($unsigned(start_addr) >= (i+1)*DmaRegionWidth) ||
($unsigned(end_addr) <= i*DmaRegionWidth)) begin +`ifdef DAS + burst_req_o[i].num_bytes = (burst_req_i.num_bytes= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*rows_das_i*DmaRegionWidth; + end else begin + // L2 --> L1 + if (burst_req_i.num_bytes<=DmaRegionWidth )begin + burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; + end else if (i==2) begin + burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; + end else if (i==3) begin + burst_req_o[i].src = burst_req_i.src+(i-1)*rows_das_i*DmaRegionWidth + DmaRegionWidth; + end + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; + end +`else // We are not involved in the transfer burst_req_o[i].src = '0; burst_req_o[i].dst = '0; @@ -137,6 +198,7 @@ module idma_distributed_midend #( if (valid[i]) begin tie_off_trans_complete_d[i] = 1'b1; end +`endif end else if (($unsigned(start_addr) >= i*DmaRegionWidth)) begin // First (and potentially only) slice // Leave address as is @@ -146,6 +208,16 @@ module idma_distributed_midend #( burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; end end else begin +`ifdef DAS + // Round up the address to the next DMA boundary + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + end else begin + burst_req_o[i].src = burst_req_i.src+(i-start_addr[DmaRegionAddressBits+1:DmaRegionAddressBits])*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; + burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; + end +`else // Round up the address to the next DMA boundary if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin 
burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; @@ -154,6 +226,7 @@ module idma_distributed_midend #( burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth-start_addr; burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; end +`endif if ($unsigned(end_addr) >= (i+1)*DmaRegionWidth) begin // Middle slice // Emit a full-sized transfer @@ -172,9 +245,9 @@ module idma_distributed_midend #( automatic string str; if (rst_ni && valid_i && ready_o) begin str = "[idma_distributed_midend] Got request\n"; - str = $sformatf("%sRequest in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); + str = $sformatf("%sRequest in: From: 0x%8x To: 0x%8x with size 0x%8x (%d)\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes, burst_req_i.num_bytes); for (int i = 0; i < NoMstPorts; i++) begin - str = $sformatf("%sOut %6d: From: 0x%8x To: 0x%8x with size %d\n", str, i, burst_req_o[i].src, burst_req_o[i].dst, burst_req_o[i].num_bytes); + str = $sformatf("%sRequest Out %6d: From: 0x%8x To: 0x%8x with size 0x%8x (%d)\n", str, i, burst_req_o[i].src, burst_req_o[i].dst, burst_req_o[i].num_bytes, burst_req_o[i].num_bytes); end f = $fopen("dma.log", "a"); $fwrite(f, str); diff --git a/hardware/deps/idma/src/midends/idma_split_midend.sv b/hardware/deps/idma/src/midends/idma_split_midend.sv index 42a21e2d2..73d27b62a 100644 --- a/hardware/deps/idma/src/midends/idma_split_midend.sv +++ b/hardware/deps/idma/src/midends/idma_split_midend.sv @@ -3,6 +3,8 @@ // SPDX-License-Identifier: SHL-0.51 // Samuel Riedel +// Bowen Wang +// Marco Bertuletti `include "common_cells/registers.svh" @@ -11,11 +13,26 @@ module idma_split_midend #( parameter int unsigned DmaRegionStart = 32'h0000_0000, parameter int unsigned DmaRegionEnd = 32'h1000_0000, parameter int unsigned AddrWidth = 32, +`ifdef DAS + parameter int unsigned NumTiles = 64, + parameter int unsigned NumBanksPerTile = 32, + parameter int unsigned 
TCDMSizePerBank = 1024, + parameter int unsigned NumDASPartitions = 4, + parameter int unsigned DASStartAddr = 1024, + parameter int unsigned NumTilesPerDma = 16, +`endif parameter type burst_req_t = logic, parameter type meta_t = logic ) ( input logic clk_i, input logic rst_ni, +`ifdef DAS + // DAS signals + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] tiles_das_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] rows_das_i, + output logic [$clog2(NumTiles):0] rows_das_o, +`endif // Slave input burst_req_t burst_req_i, input logic valid_i, @@ -28,16 +45,13 @@ module idma_split_midend #( input meta_t meta_i ); + // ------ Parameter Settings ------ // localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); - typedef logic [AddrWidth-1:0] addr_t; - addr_t start_addr, end_addr; - logic req_valid; - - - // Handle Metadata + // ------ Handle Metadata ------ // // Forward idle signal and count the trans_comlete signal + logic req_valid; logic [31:0] num_trans_d, num_trans_q; assign meta_o.backend_idle = meta_i.backend_idle; @@ -56,16 +70,142 @@ module idma_split_midend #( end `FF(num_trans_q, num_trans_d, '0, clk_i, rst_ni) - // Split requests +`ifdef DAS + localparam TileDmaRegionWidth = DmaRegionWidth / NumTiles; + logic [AddrWidth-1:0] PartitionDmaRegionWidth; + localparam DmaBackendWidth = NumBanksPerTile*NumTilesPerDma*4; // 32banks*8Tiles*4bytes + + // ------ Address translation ------ // + // Only the address in L1 SPM will be scrambled + logic [AddrWidth-1:0] post_scramble_src; + logic [AddrWidth-1:0] post_scramble_dst; + logic [$clog2(NumTiles):0] tiles_das_src, tiles_das_dst, tiles_das_sel; + logic [$clog2(NumTiles):0] rows_das_src, rows_das_dst, rows_das_sel; + + assign tiles_das_sel = tiles_das_src | tiles_das_dst; + assign rows_das_sel = rows_das_src | rows_das_dst; + assign PartitionDmaRegionWidth = TileDmaRegionWidth * tiles_das_sel; + + idma_address_scrambler #( + 
.AddrWidth (AddrWidth ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .NumDASPartitions (NumDASPartitions), + .TCDMSizePerBank (TCDMSizePerBank ) + ) i_idma_address_scrambler_src ( + .address_i (burst_req_i.src), + .num_bytes_i (burst_req_i.num_bytes), + .tiles_das_i (tiles_das_i), + .rows_das_i (rows_das_i), + .start_das_i (start_das_i), + .tiles_das_o (tiles_das_src), + .rows_das_o (rows_das_src), + .address_o (post_scramble_src) + ); + + idma_address_scrambler #( + .AddrWidth (AddrWidth ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .NumDASPartitions (NumDASPartitions), + .TCDMSizePerBank (TCDMSizePerBank ) + ) i_idma_address_scrambler_dst ( + .address_i (burst_req_i.dst), + .num_bytes_i (burst_req_i.num_bytes), + .tiles_das_i (tiles_das_i), + .rows_das_i (rows_das_i), + .start_das_i (start_das_i), + .tiles_das_o (tiles_das_dst), + .rows_das_o (rows_das_dst), + .address_o (post_scramble_dst) + ); + + // ------ Filter out address in L1 SPM ------ // + addr_t start_addr; + logic spm2dram; + always_comb begin + spm2dram = 0; + if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin + start_addr = post_scramble_src; + spm2dram = 1; + end else begin + start_addr = post_scramble_dst; + spm2dram = 0; + end + end + + // ------ Partition Row Offset Computation ------ // + // A DAS partition maps data onto a 2D grid: `tiles_das` tiles wide, `rows_das` + // rows tall. Each row holds `PartitionDmaRegionWidth` bytes. Transfers must be + // split at row boundaries because consecutive rows are `DmaRegionWidth` apart + // in physical SPM (not contiguous), even though the DRAM side is contiguous. + + // log2(tiles_das_sel): number of tile-index bits in the active partition + logic [$clog2(NumTiles):0] log2_tiles; + // Bitmask to extract the byte offset within one partition row from the + // scrambled address. 
Width = DmaRegionAddressBits - (TileIdBits - log2_tiles). + logic [AddrWidth-1:0] row_offset_mask; + // Byte offset of the scrambled start address within its partition row. + // Used to compute how much data fits in the first (possibly partial) row. + addr_t start_row_offset; + + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log2_tiles ( + .in_i (tiles_das_sel), + .cnt_o (log2_tiles ), + .empty_o (/* Unused */ ) + ); + + assign row_offset_mask = {DmaRegionAddressBits{1'b1}} >> ($clog2(NumTiles) - log2_tiles); + assign start_row_offset = start_addr & row_offset_mask; + + // ------ Beat Counter and Row/Column Index ------ // + // The beat counter tracks sub-transfer progress through the partition's 2D + // layout. It encodes a (row_idx, col_idx) pair: + // - Lower log2(rows_das) bits = row_idx (cycles through rows first) + // - Upper bits = col_idx (advances after all rows complete) + // Transfer order: row0-col0, row1-col0, ..., rowN-col0, row0-col1, ... + logic [$clog2(NumTiles):0] beat_cnt_d, beat_cnt_q; + `FFARN(beat_cnt_q, beat_cnt_d, '0, clk_i, rst_ni) + + // log2(rows_das_sel): number of row-index bits in the active partition + logic [$clog2(NumTiles):0] log2_rows; + // Bitmask to extract row index from the beat counter (lower log2_rows bits) + logic [$clog2(NumTiles):0] row_idx_mask; + // Current row index within the partition (0 .. 
rows_das-1) + logic [$clog2(NumTiles):0] row_idx; + // Current column index: how many full row sweeps have completed + logic [$clog2(NumTiles):0] col_idx; + + lzc #( + .WIDTH ($clog2(NumTiles)+1), + .MODE (1'b0 ) + ) i_log2_rows ( + .in_i (rows_das_sel), + .cnt_o (log2_rows ), + .empty_o (/* Unused */ ) + ); + + assign col_idx = beat_cnt_q >> log2_rows; + assign row_idx_mask = ~( {($clog2(NumTiles) + 1){1'b1}} << log2_rows ); + assign row_idx = beat_cnt_q & row_idx_mask; +`else + // ------ Filter out address in L1 SPM ------ // + addr_t start_addr; always_comb begin if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin start_addr = burst_req_i.src; end else begin start_addr = burst_req_i.dst; end - end_addr = start_addr + burst_req_i.num_bytes; end +`endif + // ------ Split requests ------ // enum logic {Idle, Busy} state_d, state_q; burst_req_t req_d, req_q; @@ -80,9 +220,63 @@ module idma_split_midend #( ready_o = 1'b0; req_valid = 1'b0; +`ifdef DAS + rows_das_o = rows_das_sel; + beat_cnt_d = beat_cnt_q; + if (num_trans_q == 1 && num_trans_d == 0) begin + beat_cnt_d = 0; + end +`endif + unique case (state_q) Idle: begin - if (valid_i) begin // Splitting required. + if (valid_i) begin // Splitting required +`ifdef DAS + if ((PartitionDmaRegionWidth-start_row_offset) >= burst_req_i.num_bytes) begin + burst_req_o = burst_req_i; + // Address in SPM need to be translated back to physical address + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + end + valid_o = 1'b1; + ready_o = ready_i; + req_valid = ready_i; + end else begin + // Store and acknowledge + req_d = burst_req_i; + ready_o = 1'b1; + burst_req_o = burst_req_i; + // Calculate the size for the 1st burst + burst_req_o.num_bytes = PartitionDmaRegionWidth-start_row_offset; + // TODO (bowwang): parameterize + req_d.num_bytes = (tiles_das_sel <= NumTilesPerDma) ? 
(rows_das_sel*DmaBackendWidth) : (rows_das_sel*PartitionDmaRegionWidth); + if (spm2dram) begin + burst_req_o.src = post_scramble_src; + req_d.src = post_scramble_src; + end else begin + burst_req_o.dst = post_scramble_dst; + req_d.dst = post_scramble_dst; + end + valid_o = 1'b1; + // Modify the stored info after first beat sent + if (ready_i) begin + // TODO (bowwang): May not be necessary to consider alignment + req_d.num_bytes -= PartitionDmaRegionWidth-start_row_offset; + if (spm2dram) begin + req_d.src += DmaRegionWidth-start_row_offset; + req_d.dst += PartitionDmaRegionWidth-start_row_offset; + end else begin + req_d.src += PartitionDmaRegionWidth-start_row_offset; + req_d.dst += DmaRegionWidth-start_row_offset; + end + req_valid = 1'b1; + beat_cnt_d = 1; + end + state_d = Busy; + end +`else if (DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0] >= burst_req_i.num_bytes) begin // No splitting required, just forward burst_req_o = burst_req_i; @@ -108,6 +302,7 @@ module idma_split_midend #( end state_d = Busy; end +`endif end end Busy: begin @@ -115,7 +310,41 @@ module idma_split_midend #( burst_req_o = req_q; valid_o = 1'b1; req_valid = ready_i; - if (req_q.num_bytes <= DmaRegionWidth) begin +`ifdef DAS + if ($unsigned(req_q.num_bytes) <= $unsigned(PartitionDmaRegionWidth)) begin + // Last split + if (ready_i) begin + state_d = Idle; + beat_cnt_d = beat_cnt_q + 1; + end + end else begin + burst_req_o.num_bytes = PartitionDmaRegionWidth; + if (ready_i) begin + req_d.num_bytes = req_q.num_bytes - PartitionDmaRegionWidth; + beat_cnt_d = beat_cnt_q + 1; + // SPM address stride: consecutive partition rows are DmaRegionWidth + // apart in physical memory. At the last row (row_idx == rows_das-1), + // wrap back to row 0 and advance to the next column within the row. + // DRAM address always advances contiguously by PartitionDmaRegionWidth.
+ if (spm2dram) begin + if (row_idx == rows_das_sel-1) begin + req_d.src = req_q.src + PartitionDmaRegionWidth - row_idx*DmaRegionWidth; + end else begin + req_d.src = req_q.src + DmaRegionWidth; + end + req_d.dst = req_q.dst + PartitionDmaRegionWidth; + end else begin + req_d.src = req_q.src + PartitionDmaRegionWidth; + if (row_idx == rows_das_sel-1) begin + req_d.dst = req_q.dst + PartitionDmaRegionWidth - row_idx*DmaRegionWidth; + end else begin + req_d.dst = req_q.dst + DmaRegionWidth; + end + end// spm2dram + end // ready_i + end +`else + if ($unsigned(req_q.num_bytes) <= $unsigned(DmaRegionWidth)) begin // Last split if (ready_i) begin state_d = Idle; @@ -129,6 +358,7 @@ module idma_split_midend #( req_d.dst = req_q.dst + DmaRegionWidth; end end +`endif end default: /*do nothing*/; endcase @@ -139,14 +369,14 @@ module idma_split_midend #( always_ff @(posedge clk_i or negedge rst_ni) begin automatic string str; if (rst_ni && valid_i && ready_o) begin - str = "[idma_split_midend] Got request\n"; + str = "\n\n[idma_split_midend] Got request\n"; str = $sformatf("%sSplit: Request in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); f = $fopen("dma.log", "a"); $fwrite(f, str); $fclose(f); end if (rst_ni && valid_o && ready_i) begin - str = $sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes); + str = $sformatf("Split: Out %6d: From: 0x%8x To: 0x%8x with size %d, start_addr 0x%8x.\n", num_trans_q, burst_req_o.src, burst_req_o.dst, burst_req_o.num_bytes, start_addr); f = $fopen("dma.log", "a"); $fwrite(f, str); $fclose(f); diff --git a/hardware/scripts/questa/wave_core.tcl b/hardware/scripts/questa/wave_core.tcl index 85340078d..bcc97e0a6 100644 --- a/hardware/scripts/questa/wave_core.tcl +++ b/hardware/scripts/questa/wave_core.tcl @@ -13,7 +13,7 @@ if {$config == {terapool}} { add wave -noupdate -group core[$1][$2][$3][$4] -group 
Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/RegisterTCDMReq add wave -noupdate -group core[$1][$2][$3][$4] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/RegisterTCDMResp add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/clk_i - add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/rst_i + add wave -noupdate -group core[$1][$2][$3][$4] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/rst_ni add wave -noupdate -group core[$1][$2][$3][$4] -radix unsigned /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/gen_rtl_group/i_group/gen_sub_groups[$2]/gen_rtl_sg/i_sub_group/gen_tiles[$3]/i_tile/gen_cores[$4]/gen_mempool_cc/riscv_core/i_snitch/hart_id_i add wave -noupdate -group core[$1][$2][$3][$4] -divider Instructions @@ -182,7 +182,7 @@ if {$config == {terapool}} { add wave -noupdate -group core[$1][$2][$3] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/RegisterTCDMReq add wave -noupdate -group core[$1][$2][$3] -group Params /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/RegisterTCDMResp add wave -noupdate -group core[$1][$2][$3] 
/mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/clk_i - add wave -noupdate -group core[$1][$2][$3] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/rst_i + add wave -noupdate -group core[$1][$2][$3] /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/rst_ni add wave -noupdate -group core[$1][$2][$3] -radix unsigned /mempool_tb/dut/i_mempool_cluster/gen_groups[$1]/i_group/gen_tiles[$2]/i_tile/gen_cores[$3]/gen_mempool_cc/riscv_core/i_snitch/hart_id_i add wave -noupdate -group core[$1][$2][$3] -divider Instructions diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv index d2c790a65..3ff440e84 100644 --- a/hardware/src/address_scrambler.sv +++ b/hardware/src/address_scrambler.sv @@ -7,18 +7,31 @@ // Current constraints: // Author: Samuel Riedel +// Author: Marco Bertuletti module address_scrambler #( parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, parameter int unsigned ByteOffset = 2, + parameter bit Bypass = 0, parameter int unsigned NumTiles = 2, parameter int unsigned NumBanksPerTile = 2, - parameter bit Bypass = 0, - parameter int unsigned SeqMemSizePerTile = 4*1024 + parameter int unsigned TCDMSizePerBank = 1024, + parameter int unsigned SeqMemSizePerTile = 4096, + parameter int unsigned NumDASPartitions = 4, + // Dependant parameters, do not change + parameter int unsigned RowsWidth = $clog2(TCDMSizePerBank) - ByteOffset + 1, + parameter int unsigned MaxPartitionRowWidth = $clog2(TCDMSizePerBank) - ByteOffset, // maximum half of L1 + parameter int unsigned MemSizePerTile = NumBanksPerTile*TCDMSizePerBank, + parameter int unsigned MemSizePerRow = (1 << ByteOffset)*NumBanksPerTile*NumTiles ) ( - input logic [AddrWidth-1:0] address_i, - output logic [AddrWidth-1:0] address_o 
+ input logic [AddrWidth-1:0] address_i, + input logic [NumDASPartitions-1:0][$clog2(NumTiles):0] tiles_das_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][MaxPartitionRowWidth-1:0] rows_das_i, + output logic [AddrWidth-1:0] address_o ); + // Stack Sequential Settings localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); localparam int unsigned TileIdBits = $clog2(NumTiles); localparam int unsigned SeqPerTileBits = $clog2(SeqMemSizePerTile); @@ -26,32 +39,102 @@ module address_scrambler #( localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; - if (Bypass || NumTiles < 2) begin + if (Bypass || NumTiles < 2) begin: gen_bypass assign address_o = address_i; - end else begin + + end else begin: gen_scrambling + // ------ Stack Region Logic ------ // logic [ScrambleBits-1:0] scramble; // Address bits that have to be shuffled around logic [TileIdBits-1:0] tile_id; // Which tile does this address region belong to - // Leave this part of the address unchanged - // The LSBs that correspond to the offset inside a tile. These are the byte offset (bank width) - // and the Bank offset (Number of Banks in tile) - assign address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; - // The MSBs that are outside of the sequential memory size. Currently the sequential memory size - // always starts at 0. 
These are all the MSBs up to SeqMemSizePerTile*NumTiles - assign address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; - // Scramble the middle part // Bits that would have gone to different tiles but now go to increasing lines in the same tile assign scramble = address_i[SeqPerTileBits-1:ConstantBitsLSB]; // Bits that would // Bits that would have gone to increasing lines in the same tile but now go to different tiles assign tile_id = address_i[SeqTotalBits-1:SeqPerTileBits]; + // ------ Heap Sequential Signals ------ // + + // `tile_bits` : how many fixed TileID bits + // `row_bits` : how many bits need to swap to the start of Row Index + logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_bits; + logic [NumDASPartitions-1:0][$clog2(MaxPartitionRowWidth)-1:0] row_bits; + + for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index + lzc #( + .WIDTH ($clog2(NumTiles)+1 ), + .MODE (1'b0 ) + ) i_log_tile_bits ( + .in_i (tiles_das_i[i] ), + .cnt_o (tile_bits[i] ), + .empty_o (/* Unused */ ) + ); + lzc #( + .WIDTH (MaxPartitionRowWidth ), + .MODE (1'b0 ) + ) i_log_row_bits ( + .in_i (rows_das_i[i] ), + .cnt_o (row_bits[i] ), + .empty_o (/* Unused */ ) + ); + end + + logic [NumDASPartitions-1:0][AddrWidth-1:0] lsb_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] row_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] prt_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] msb_addr; + logic [NumDASPartitions-1:0][AddrWidth-1:0] aligned_addr; + + // Narrow row-index field signals (MaxPartitionRowWidth-bit) to replace the + // 32-bit adders that were on the critical path. Only the row-index bits at + // [TileIdBits+ConstantBitsLSB +: MaxPartitionRowWidth] are affected by the + // subtract/add — all other bits pass through unchanged. 
+ localparam int unsigned RowFieldLSB = TileIdBits + ConstantBitsLSB; + logic [NumDASPartitions-1:0][MaxPartitionRowWidth-1:0] start_row_field; + always_comb begin + // Default: Unscrambled - address_o[SeqTotalBits-1:ConstantBitsLSB] = {tile_id, scramble}; - // If not in bypass mode and address is in sequential region and more than one tile - if (address_i < (NumTiles * SeqMemSizePerTile)) begin + address_o = address_i; + + // Stack Region + if (address_i < (NumTiles * SeqMemSizePerTile)) begin: gen_stack_scrambling + address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; + address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; + + // DAS address scrambling + end else begin: gen_das_scrambling + + for (int p = 0; p < NumDASPartitions; p++) begin + if ( (address_i >= start_das_i[p]) && (address_i < start_das_i[p]+MemSizePerRow*rows_das_i[p]) && (tiles_das_i[p] != NumTiles) ) begin + + lsb_addr[p] = address_i & ((1 << (tile_bits[p]+ConstantBitsLSB)) - 1); + msb_addr[p] = address_i & ~((1 << (row_bits[p]+TileIdBits+ConstantBitsLSB)) - 1); + + // Narrow subtract: extract the row-index field from the partition + // start address (masked to row_bits width via rows_das-1), then + // subtract from the address row field to get the partition-relative + // row index. Only MaxPartitionRowWidth bits (~8 bits) instead of 32. 
+ start_row_field[p] = start_das_i[p][RowFieldLSB +: MaxPartitionRowWidth] + & (rows_das_i[p] - 1); + aligned_addr[p] = address_i; + aligned_addr[p][RowFieldLSB +: MaxPartitionRowWidth] = + address_i[RowFieldLSB +: MaxPartitionRowWidth] - start_row_field[p]; + + prt_addr[p] = (aligned_addr[p] >> row_bits[p] ) & (((1 << (TileIdBits - tile_bits[p])) - 1) << (ConstantBitsLSB + tile_bits[p])); + row_addr[p] = (aligned_addr[p] << (TileIdBits - tile_bits[p])) & (((1 << (row_bits[p]) ) - 1) << (TileIdBits + ConstantBitsLSB )); + address_o = msb_addr[p] | row_addr[p] | prt_addr[p] | lsb_addr[p]; + + // Narrow add: restore the absolute row-index offset. + // Only the row-index field is modified — MSB and LSB bits are + // already correct from msb_addr and lsb_addr. + address_o[RowFieldLSB +: MaxPartitionRowWidth] = + address_o[RowFieldLSB +: MaxPartitionRowWidth] + start_row_field[p]; + + end + end + end end end diff --git a/hardware/src/control_registers/control_registers.hjson b/hardware/src/control_registers/control_registers.hjson index 1ef33e86e..5a93c1087 100644 --- a/hardware/src/control_registers/control_registers.hjson +++ b/hardware/src/control_registers/control_registers.hjson @@ -22,6 +22,11 @@ type: "int", default: "8" } + { name: "NumDasPartitions", + desc: "Supported number of DAS partitions", + type: "int", + default: "4" + } ], regwidth: 32 registers: [ @@ -135,6 +140,46 @@ cname: "ro_cache_end" fields: [{ bits: "31:0" }] } + }, + { multireg: + { + name: "tiles_das" + desc: "DAS per region folding granularity. Number of Tiles per DAS partition." + swaccess: "wo" + hwaccess: "hrw" + hwqe: "true" + // External because we want to define the reset from a parameter + hwext: "true" + count: "NumDasPartitions" + cname: "tiles_das" + fields: [{ bits: "31:0" }] + }, + }, + { multireg: + { + name: "start_das" + desc: "DAS per region allocated start address." 
+ swaccess: "wo" + hwaccess: "hrw" + hwqe: "true" + // External because we want to define the reset from a parameter + hwext: "true" + count: "NumDasPartitions" + cname: "start_das" + fields: [{ bits: "31:0" }] + }, + }, + { multireg: + { + name: "rows_das" + desc: "DAS per region allocated size." + swaccess: "wo" + hwaccess: "hro" + hwqe: "false" + count: "NumDasPartitions" + cname: "rows_das" + fields: [{ bits: "31:0" }] + }, } ] } diff --git a/hardware/src/control_registers/control_registers_reg_pkg.sv b/hardware/src/control_registers/control_registers_reg_pkg.sv index c061a7b94..f5612d482 100644 --- a/hardware/src/control_registers/control_registers_reg_pkg.sv +++ b/hardware/src/control_registers/control_registers_reg_pkg.sv @@ -9,9 +9,10 @@ package control_registers_reg_pkg; // Param list parameter int ROCacheNumAddrRules = 4; parameter int MAX_NumGroups = 8; + parameter int NumDasPartitions = 4; // Address widths within the block - parameter int BlockAw = 7; + parameter int BlockAw = 8; //////////////////////////// // Typedefs for registers // @@ -64,6 +65,20 @@ package control_registers_reg_pkg; logic qe; } control_registers_reg2hw_ro_cache_end_mreg_t; + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_tiles_das_mreg_t; + + typedef struct packed { + logic [31:0] q; + logic qe; + } control_registers_reg2hw_start_das_mreg_t; + + typedef struct packed { + logic [31:0] q; + } control_registers_reg2hw_rows_das_mreg_t; + typedef struct packed { logic [31:0] d; } control_registers_hw2reg_tcdm_start_address_reg_t; @@ -84,56 +99,81 @@ package control_registers_reg_pkg; logic [31:0] d; } control_registers_hw2reg_ro_cache_end_mreg_t; + typedef struct packed { + logic [31:0] d; + } control_registers_hw2reg_tiles_das_mreg_t; + + typedef struct packed { + logic [31:0] d; + } control_registers_hw2reg_start_das_mreg_t; + // Register -> HW type typedef struct packed { - control_registers_reg2hw_eoc_reg_t eoc; // [755:724] - 
control_registers_reg2hw_wake_up_reg_t wake_up; // [723:691] - control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [690:427] - control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [426:394] - control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [393:361] - control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [360:328] - control_registers_reg2hw_ro_cache_enable_reg_t ro_cache_enable; // [327:296] - control_registers_reg2hw_ro_cache_flush_reg_t ro_cache_flush; // [295:264] - control_registers_reg2hw_ro_cache_start_mreg_t [3:0] ro_cache_start; // [263:132] - control_registers_reg2hw_ro_cache_end_mreg_t [3:0] ro_cache_end; // [131:0] + control_registers_reg2hw_eoc_reg_t eoc; // [1147:1116] + control_registers_reg2hw_wake_up_reg_t wake_up; // [1115:1083] + control_registers_reg2hw_wake_up_tile_mreg_t [7:0] wake_up_tile; // [1082:819] + control_registers_reg2hw_wake_up_group_reg_t wake_up_group; // [818:786] + control_registers_reg2hw_wake_up_strd_reg_t wake_up_strd; // [785:753] + control_registers_reg2hw_wake_up_offst_reg_t wake_up_offst; // [752:720] + control_registers_reg2hw_ro_cache_enable_reg_t ro_cache_enable; // [719:688] + control_registers_reg2hw_ro_cache_flush_reg_t ro_cache_flush; // [687:656] + control_registers_reg2hw_ro_cache_start_mreg_t [3:0] ro_cache_start; // [655:524] + control_registers_reg2hw_ro_cache_end_mreg_t [3:0] ro_cache_end; // [523:392] + control_registers_reg2hw_tiles_das_mreg_t [3:0] tiles_das; // [391:260] + control_registers_reg2hw_start_das_mreg_t [3:0] start_das; // [259:128] + control_registers_reg2hw_rows_das_mreg_t [3:0] rows_das; // [127:0] } control_registers_reg2hw_t; // HW -> register type typedef struct packed { - control_registers_hw2reg_tcdm_start_address_reg_t tcdm_start_address; // [351:320] - control_registers_hw2reg_tcdm_end_address_reg_t tcdm_end_address; // [319:288] - control_registers_hw2reg_nr_cores_reg_reg_t nr_cores_reg; // [287:256] - 
control_registers_hw2reg_ro_cache_start_mreg_t [3:0] ro_cache_start; // [255:128] - control_registers_hw2reg_ro_cache_end_mreg_t [3:0] ro_cache_end; // [127:0] + control_registers_hw2reg_tcdm_start_address_reg_t tcdm_start_address; // [607:576] + control_registers_hw2reg_tcdm_end_address_reg_t tcdm_end_address; // [575:544] + control_registers_hw2reg_nr_cores_reg_reg_t nr_cores_reg; // [543:512] + control_registers_hw2reg_ro_cache_start_mreg_t [3:0] ro_cache_start; // [511:384] + control_registers_hw2reg_ro_cache_end_mreg_t [3:0] ro_cache_end; // [383:256] + control_registers_hw2reg_tiles_das_mreg_t [3:0] tiles_das; // [255:128] + control_registers_hw2reg_start_das_mreg_t [3:0] start_das; // [127:0] } control_registers_hw2reg_t; // Register offsets - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_EOC_OFFSET = 7'h 0; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFSET = 7'h 4; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_0_OFFSET = 7'h 8; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_1_OFFSET = 7'h c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_2_OFFSET = 7'h 10; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_3_OFFSET = 7'h 14; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_4_OFFSET = 7'h 18; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_5_OFFSET = 7'h 1c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_6_OFFSET = 7'h 20; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_7_OFFSET = 7'h 24; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET = 7'h 28; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET = 7'h 2c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET = 7'h 30; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET = 7'h 34; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET = 7'h 38; - parameter logic 
[BlockAw-1:0] CONTROL_REGISTERS_NR_CORES_REG_OFFSET = 7'h 3c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET = 7'h 40; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET = 7'h 44; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET = 7'h 48; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET = 7'h 4c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET = 7'h 50; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET = 7'h 54; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET = 7'h 58; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET = 7'h 5c; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET = 7'h 60; - parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET = 7'h 64; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_EOC_OFFSET = 8'h 0; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFSET = 8'h 4; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_0_OFFSET = 8'h 8; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_1_OFFSET = 8'h c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_2_OFFSET = 8'h 10; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_3_OFFSET = 8'h 14; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_4_OFFSET = 8'h 18; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_5_OFFSET = 8'h 1c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_6_OFFSET = 8'h 20; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_TILE_7_OFFSET = 8'h 24; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_GROUP_OFFSET = 8'h 28; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_STRD_OFFSET = 8'h 2c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_WAKE_UP_OFFST_OFFSET = 8'h 30; + parameter logic [BlockAw-1:0] 
CONTROL_REGISTERS_TCDM_START_ADDRESS_OFFSET = 8'h 34; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TCDM_END_ADDRESS_OFFSET = 8'h 38; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_NR_CORES_REG_OFFSET = 8'h 3c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_ENABLE_OFFSET = 8'h 40; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_FLUSH_OFFSET = 8'h 44; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_0_OFFSET = 8'h 48; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_1_OFFSET = 8'h 4c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_2_OFFSET = 8'h 50; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_START_3_OFFSET = 8'h 54; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_0_OFFSET = 8'h 58; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET = 8'h 5c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET = 8'h 60; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET = 8'h 64; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TILES_DAS_0_OFFSET = 8'h 68; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TILES_DAS_1_OFFSET = 8'h 6c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TILES_DAS_2_OFFSET = 8'h 70; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_TILES_DAS_3_OFFSET = 8'h 74; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_0_OFFSET = 8'h 78; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_1_OFFSET = 8'h 7c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_2_OFFSET = 8'h 80; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_START_DAS_3_OFFSET = 8'h 84; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_0_OFFSET = 8'h 88; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_1_OFFSET = 8'h 8c; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_2_OFFSET = 8'h 90; + parameter logic [BlockAw-1:0] CONTROL_REGISTERS_ROWS_DAS_3_OFFSET = 8'h 94; // Reset 
values for hwext registers and their fields parameter logic [31:0] CONTROL_REGISTERS_TCDM_START_ADDRESS_RESVAL = 32'h 0; @@ -147,6 +187,14 @@ package control_registers_reg_pkg; parameter logic [31:0] CONTROL_REGISTERS_RO_CACHE_END_1_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_RO_CACHE_END_2_RESVAL = 32'h 0; parameter logic [31:0] CONTROL_REGISTERS_RO_CACHE_END_3_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_TILES_DAS_0_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_TILES_DAS_1_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_TILES_DAS_2_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_TILES_DAS_3_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_0_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_1_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_2_RESVAL = 32'h 0; + parameter logic [31:0] CONTROL_REGISTERS_START_DAS_3_RESVAL = 32'h 0; // Register index typedef enum int { @@ -175,11 +223,23 @@ package control_registers_reg_pkg; CONTROL_REGISTERS_RO_CACHE_END_0, CONTROL_REGISTERS_RO_CACHE_END_1, CONTROL_REGISTERS_RO_CACHE_END_2, - CONTROL_REGISTERS_RO_CACHE_END_3 + CONTROL_REGISTERS_RO_CACHE_END_3, + CONTROL_REGISTERS_TILES_DAS_0, + CONTROL_REGISTERS_TILES_DAS_1, + CONTROL_REGISTERS_TILES_DAS_2, + CONTROL_REGISTERS_TILES_DAS_3, + CONTROL_REGISTERS_START_DAS_0, + CONTROL_REGISTERS_START_DAS_1, + CONTROL_REGISTERS_START_DAS_2, + CONTROL_REGISTERS_START_DAS_3, + CONTROL_REGISTERS_ROWS_DAS_0, + CONTROL_REGISTERS_ROWS_DAS_1, + CONTROL_REGISTERS_ROWS_DAS_2, + CONTROL_REGISTERS_ROWS_DAS_3 } control_registers_id_e; // Register width information to check illegal writes - parameter logic [3:0] CONTROL_REGISTERS_PERMIT [26] = '{ + parameter logic [3:0] CONTROL_REGISTERS_PERMIT [38] = '{ 4'b 1111, // index[ 0] CONTROL_REGISTERS_EOC 4'b 1111, // index[ 1] CONTROL_REGISTERS_WAKE_UP 4'b 1111, // index[ 2] CONTROL_REGISTERS_WAKE_UP_TILE_0 @@ -205,7 +265,19 @@ 
package control_registers_reg_pkg; 4'b 1111, // index[22] CONTROL_REGISTERS_RO_CACHE_END_0 4'b 1111, // index[23] CONTROL_REGISTERS_RO_CACHE_END_1 4'b 1111, // index[24] CONTROL_REGISTERS_RO_CACHE_END_2 - 4'b 1111 // index[25] CONTROL_REGISTERS_RO_CACHE_END_3 + 4'b 1111, // index[25] CONTROL_REGISTERS_RO_CACHE_END_3 + 4'b 1111, // index[26] CONTROL_REGISTERS_TILES_DAS_0 + 4'b 1111, // index[27] CONTROL_REGISTERS_TILES_DAS_1 + 4'b 1111, // index[28] CONTROL_REGISTERS_TILES_DAS_2 + 4'b 1111, // index[29] CONTROL_REGISTERS_TILES_DAS_3 + 4'b 1111, // index[30] CONTROL_REGISTERS_START_DAS_0 + 4'b 1111, // index[31] CONTROL_REGISTERS_START_DAS_1 + 4'b 1111, // index[32] CONTROL_REGISTERS_START_DAS_2 + 4'b 1111, // index[33] CONTROL_REGISTERS_START_DAS_3 + 4'b 1111, // index[34] CONTROL_REGISTERS_ROWS_DAS_0 + 4'b 1111, // index[35] CONTROL_REGISTERS_ROWS_DAS_1 + 4'b 1111, // index[36] CONTROL_REGISTERS_ROWS_DAS_2 + 4'b 1111 // index[37] CONTROL_REGISTERS_ROWS_DAS_3 }; endpackage diff --git a/hardware/src/control_registers/control_registers_reg_top.sv b/hardware/src/control_registers/control_registers_reg_top.sv index b089e4c08..dad4d2c9d 100644 --- a/hardware/src/control_registers/control_registers_reg_top.sv +++ b/hardware/src/control_registers/control_registers_reg_top.sv @@ -10,7 +10,7 @@ module control_registers_reg_top #( parameter type reg_req_t = logic, parameter type reg_rsp_t = logic, - parameter int AW = 7 + parameter int AW = 8 ) ( input logic clk_i, input logic rst_ni, @@ -139,6 +139,30 @@ module control_registers_reg_top #( logic [31:0] ro_cache_end_3_wd; logic ro_cache_end_3_we; logic ro_cache_end_3_re; + logic [31:0] tiles_das_0_wd; + logic tiles_das_0_we; + logic [31:0] tiles_das_1_wd; + logic tiles_das_1_we; + logic [31:0] tiles_das_2_wd; + logic tiles_das_2_we; + logic [31:0] tiles_das_3_wd; + logic tiles_das_3_we; + logic [31:0] start_das_0_wd; + logic start_das_0_we; + logic [31:0] start_das_1_wd; + logic start_das_1_we; + logic [31:0] start_das_2_wd; 
+ logic start_das_2_we; + logic [31:0] start_das_3_wd; + logic start_das_3_we; + logic [31:0] rows_das_0_wd; + logic rows_das_0_we; + logic [31:0] rows_das_1_wd; + logic rows_das_1_we; + logic [31:0] rows_das_2_wd; + logic rows_das_2_we; + logic [31:0] rows_das_3_wd; + logic rows_das_3_we; // Register instances // R[eoc]: V(False) @@ -717,8 +741,246 @@ module control_registers_reg_top #( + // Subregister 0 of Multireg tiles_das + // R[tiles_das_0]: V(True) - logic [25:0] addr_hit; + prim_subreg_ext #( + .DW (32) + ) u_tiles_das_0 ( + .re (1'b0), + .we (tiles_das_0_we), + .wd (tiles_das_0_wd), + .d (hw2reg.tiles_das[0].d), + .qre (), + .qe (reg2hw.tiles_das[0].qe), + .q (reg2hw.tiles_das[0].q ), + .qs () + ); + + // Subregister 1 of Multireg tiles_das + // R[tiles_das_1]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_tiles_das_1 ( + .re (1'b0), + .we (tiles_das_1_we), + .wd (tiles_das_1_wd), + .d (hw2reg.tiles_das[1].d), + .qre (), + .qe (reg2hw.tiles_das[1].qe), + .q (reg2hw.tiles_das[1].q ), + .qs () + ); + + // Subregister 2 of Multireg tiles_das + // R[tiles_das_2]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_tiles_das_2 ( + .re (1'b0), + .we (tiles_das_2_we), + .wd (tiles_das_2_wd), + .d (hw2reg.tiles_das[2].d), + .qre (), + .qe (reg2hw.tiles_das[2].qe), + .q (reg2hw.tiles_das[2].q ), + .qs () + ); + + // Subregister 3 of Multireg tiles_das + // R[tiles_das_3]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_tiles_das_3 ( + .re (1'b0), + .we (tiles_das_3_we), + .wd (tiles_das_3_wd), + .d (hw2reg.tiles_das[3].d), + .qre (), + .qe (reg2hw.tiles_das[3].qe), + .q (reg2hw.tiles_das[3].q ), + .qs () + ); + + + + // Subregister 0 of Multireg start_das + // R[start_das_0]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_0 ( + .re (1'b0), + .we (start_das_0_we), + .wd (start_das_0_wd), + .d (hw2reg.start_das[0].d), + .qre (), + .qe (reg2hw.start_das[0].qe), + .q (reg2hw.start_das[0].q ), + .qs () + ); + + // Subregister 1 of Multireg start_das + // 
R[start_das_1]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_1 ( + .re (1'b0), + .we (start_das_1_we), + .wd (start_das_1_wd), + .d (hw2reg.start_das[1].d), + .qre (), + .qe (reg2hw.start_das[1].qe), + .q (reg2hw.start_das[1].q ), + .qs () + ); + + // Subregister 2 of Multireg start_das + // R[start_das_2]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_2 ( + .re (1'b0), + .we (start_das_2_we), + .wd (start_das_2_wd), + .d (hw2reg.start_das[2].d), + .qre (), + .qe (reg2hw.start_das[2].qe), + .q (reg2hw.start_das[2].q ), + .qs () + ); + + // Subregister 3 of Multireg start_das + // R[start_das_3]: V(True) + + prim_subreg_ext #( + .DW (32) + ) u_start_das_3 ( + .re (1'b0), + .we (start_das_3_we), + .wd (start_das_3_wd), + .d (hw2reg.start_das[3].d), + .qre (), + .qe (reg2hw.start_das[3].qe), + .q (reg2hw.start_das[3].q ), + .qs () + ); + + + + // Subregister 0 of Multireg rows_das + // R[rows_das_0]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_0 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_0_we), + .wd (rows_das_0_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[0].q ), + + .qs () + ); + + // Subregister 1 of Multireg rows_das + // R[rows_das_1]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_1 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_1_we), + .wd (rows_das_1_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[1].q ), + + .qs () + ); + + // Subregister 2 of Multireg rows_das + // R[rows_das_2]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_2 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_2_we), + .wd (rows_das_2_wd), + + // from internal 
hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[2].q ), + + .qs () + ); + + // Subregister 3 of Multireg rows_das + // R[rows_das_3]: V(False) + + prim_subreg #( + .DW (32), + .SWACCESS("WO"), + .RESVAL (32'h0) + ) u_rows_das_3 ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + + // from register interface + .we (rows_das_3_we), + .wd (rows_das_3_wd), + + // from internal hardware + .de (1'b0), + .d ('0 ), + + // to internal hardware + .qe (), + .q (reg2hw.rows_das[3].q ), + + .qs () + ); + + + + + logic [37:0] addr_hit; always_comb begin addr_hit = '0; addr_hit[ 0] = (reg_addr == CONTROL_REGISTERS_EOC_OFFSET); @@ -747,6 +1009,18 @@ module control_registers_reg_top #( addr_hit[23] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_1_OFFSET); addr_hit[24] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_2_OFFSET); addr_hit[25] = (reg_addr == CONTROL_REGISTERS_RO_CACHE_END_3_OFFSET); + addr_hit[26] = (reg_addr == CONTROL_REGISTERS_TILES_DAS_0_OFFSET); + addr_hit[27] = (reg_addr == CONTROL_REGISTERS_TILES_DAS_1_OFFSET); + addr_hit[28] = (reg_addr == CONTROL_REGISTERS_TILES_DAS_2_OFFSET); + addr_hit[29] = (reg_addr == CONTROL_REGISTERS_TILES_DAS_3_OFFSET); + addr_hit[30] = (reg_addr == CONTROL_REGISTERS_START_DAS_0_OFFSET); + addr_hit[31] = (reg_addr == CONTROL_REGISTERS_START_DAS_1_OFFSET); + addr_hit[32] = (reg_addr == CONTROL_REGISTERS_START_DAS_2_OFFSET); + addr_hit[33] = (reg_addr == CONTROL_REGISTERS_START_DAS_3_OFFSET); + addr_hit[34] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_0_OFFSET); + addr_hit[35] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_1_OFFSET); + addr_hit[36] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_2_OFFSET); + addr_hit[37] = (reg_addr == CONTROL_REGISTERS_ROWS_DAS_3_OFFSET); end assign addrmiss = (reg_re || reg_we) ? 
~|addr_hit : 1'b0 ; @@ -779,7 +1053,19 @@ module control_registers_reg_top #( (addr_hit[22] & (|(CONTROL_REGISTERS_PERMIT[22] & ~reg_be))) | (addr_hit[23] & (|(CONTROL_REGISTERS_PERMIT[23] & ~reg_be))) | (addr_hit[24] & (|(CONTROL_REGISTERS_PERMIT[24] & ~reg_be))) | - (addr_hit[25] & (|(CONTROL_REGISTERS_PERMIT[25] & ~reg_be))))); + (addr_hit[25] & (|(CONTROL_REGISTERS_PERMIT[25] & ~reg_be))) | + (addr_hit[26] & (|(CONTROL_REGISTERS_PERMIT[26] & ~reg_be))) | + (addr_hit[27] & (|(CONTROL_REGISTERS_PERMIT[27] & ~reg_be))) | + (addr_hit[28] & (|(CONTROL_REGISTERS_PERMIT[28] & ~reg_be))) | + (addr_hit[29] & (|(CONTROL_REGISTERS_PERMIT[29] & ~reg_be))) | + (addr_hit[30] & (|(CONTROL_REGISTERS_PERMIT[30] & ~reg_be))) | + (addr_hit[31] & (|(CONTROL_REGISTERS_PERMIT[31] & ~reg_be))) | + (addr_hit[32] & (|(CONTROL_REGISTERS_PERMIT[32] & ~reg_be))) | + (addr_hit[33] & (|(CONTROL_REGISTERS_PERMIT[33] & ~reg_be))) | + (addr_hit[34] & (|(CONTROL_REGISTERS_PERMIT[34] & ~reg_be))) | + (addr_hit[35] & (|(CONTROL_REGISTERS_PERMIT[35] & ~reg_be))) | + (addr_hit[36] & (|(CONTROL_REGISTERS_PERMIT[36] & ~reg_be))) | + (addr_hit[37] & (|(CONTROL_REGISTERS_PERMIT[37] & ~reg_be))))); end assign eoc_we = addr_hit[0] & reg_we & !reg_error; @@ -865,6 +1151,42 @@ module control_registers_reg_top #( assign ro_cache_end_3_wd = reg_wdata[31:0]; assign ro_cache_end_3_re = addr_hit[25] & reg_re & !reg_error; + assign tiles_das_0_we = addr_hit[26] & reg_we & !reg_error; + assign tiles_das_0_wd = reg_wdata[31:0]; + + assign tiles_das_1_we = addr_hit[27] & reg_we & !reg_error; + assign tiles_das_1_wd = reg_wdata[31:0]; + + assign tiles_das_2_we = addr_hit[28] & reg_we & !reg_error; + assign tiles_das_2_wd = reg_wdata[31:0]; + + assign tiles_das_3_we = addr_hit[29] & reg_we & !reg_error; + assign tiles_das_3_wd = reg_wdata[31:0]; + + assign start_das_0_we = addr_hit[30] & reg_we & !reg_error; + assign start_das_0_wd = reg_wdata[31:0]; + + assign start_das_1_we = addr_hit[31] & reg_we & !reg_error; + 
assign start_das_1_wd = reg_wdata[31:0]; + + assign start_das_2_we = addr_hit[32] & reg_we & !reg_error; + assign start_das_2_wd = reg_wdata[31:0]; + + assign start_das_3_we = addr_hit[33] & reg_we & !reg_error; + assign start_das_3_wd = reg_wdata[31:0]; + + assign rows_das_0_we = addr_hit[34] & reg_we & !reg_error; + assign rows_das_0_wd = reg_wdata[31:0]; + + assign rows_das_1_we = addr_hit[35] & reg_we & !reg_error; + assign rows_das_1_wd = reg_wdata[31:0]; + + assign rows_das_2_we = addr_hit[36] & reg_we & !reg_error; + assign rows_das_2_wd = reg_wdata[31:0]; + + assign rows_das_3_we = addr_hit[37] & reg_we & !reg_error; + assign rows_das_3_wd = reg_wdata[31:0]; + // Read data return always_comb begin reg_rdata_next = '0; @@ -973,6 +1295,54 @@ module control_registers_reg_top #( reg_rdata_next[31:0] = ro_cache_end_3_qs; end + addr_hit[26]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[27]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[28]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[29]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[30]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[31]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[32]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[33]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[34]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[35]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[36]: begin + reg_rdata_next[31:0] = '0; + end + + addr_hit[37]: begin + reg_rdata_next[31:0] = '0; + end + default: begin reg_rdata_next = '1; end @@ -995,7 +1365,7 @@ endmodule module control_registers_reg_top_intf #( - parameter int AW = 7, + parameter int AW = 8, localparam int DW = 32 ) ( input logic clk_i, diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 03cbe1bbb..220befbc1 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -7,6 +7,10 @@ module ctrl_registers import 
mempool_pkg::ro_cache_ctrl_t; + import mempool_pkg::NumDASPartitions; + import mempool_pkg::TileInterleavingWidth; + import mempool_pkg::RowsInterleavingWidth; + import mempool_pkg::AddrWidth; #( parameter int DataWidth = 32, // Parameters @@ -17,16 +21,19 @@ module ctrl_registers parameter type axi_lite_req_t = logic, parameter type axi_lite_resp_t = logic ) ( - input logic clk_i, - input logic rst_ni, + input logic clk_i, + input logic rst_ni, // AXI Bus - input axi_lite_req_t axi_lite_slave_req_i, - output axi_lite_resp_t axi_lite_slave_resp_o, + input axi_lite_req_t axi_lite_slave_req_i, + output axi_lite_resp_t axi_lite_slave_resp_o, // Control registers - output logic [DataWidth-1:0] eoc_o, - output logic eoc_valid_o, - output logic [NumCores-1:0] wake_up_o, - output ro_cache_ctrl_t ro_cache_ctrl_o + output logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] tiles_das_o, + output logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_o, + output logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_o, + output logic [DataWidth-1:0] eoc_o, + output logic eoc_valid_o, + output logic [NumCores-1:0] wake_up_o, + output ro_cache_ctrl_t ro_cache_ctrl_o ); import mempool_pkg::AddrWidth; @@ -98,6 +105,14 @@ module ctrl_registers `FFL(ctrl_hw2reg.ro_cache_end[i].d, ctrl_reg2hw.ro_cache_end[i].q, ctrl_reg2hw.ro_cache_end[i].qe, ro_cache_regions[i].end_addr, clk_i, rst_ni) end + for (genvar i = 0; i < mempool_pkg::NumDASPartitions; i++) begin: gen_das_regs + `FFL(ctrl_hw2reg.tiles_das[i].d, ctrl_reg2hw.tiles_das[i].q, ctrl_reg2hw.tiles_das[i].qe, mempool_pkg::NumTiles); + `FFL(ctrl_hw2reg.start_das[i].d, ctrl_reg2hw.start_das[i].q, ctrl_reg2hw.start_das[i].qe, mempool_pkg::DASStartAddr); + assign tiles_das_o[i] = ctrl_hw2reg.tiles_das[i].d[TileInterleavingWidth-1:0]; + assign start_das_o[i] = ctrl_hw2reg.start_das[i].d; + assign rows_das_o[i] = ctrl_reg2hw.rows_das[i].q[RowsInterleavingWidth-1:0]; + end + /************************ * Wakeup Pulse Logic * 
************************/ diff --git a/hardware/src/mempool_cluster.sv b/hardware/src/mempool_cluster.sv index 561e0d369..007320ac7 100644 --- a/hardware/src/mempool_cluster.sv +++ b/hardware/src/mempool_cluster.sv @@ -37,6 +37,13 @@ module mempool_cluster // AXI Interface output axi_tile_req_t [NumAXIMasters-1:0] axi_mst_req_o, input axi_tile_resp_t [NumAXIMasters-1:0] axi_mst_resp_i +`ifdef DAS + , + // DAS partition configuration + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] tiles_das_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i +`endif ); /********************* @@ -73,6 +80,7 @@ module mempool_cluster `FF(dma_meta_o, dma_meta_cut, '0, clk_i, rst_ni); + dma_req_t dma_req_split; logic dma_req_split_valid; logic dma_req_split_ready; @@ -81,27 +89,40 @@ module mempool_cluster logic [NumGroups-1:0] dma_req_group_valid, dma_req_group_q_valid; logic [NumGroups-1:0] dma_req_group_ready, dma_req_group_q_ready; dma_meta_t [NumGroups-1:0] dma_meta, dma_meta_q; + logic [RowsInterleavingWidth-1:0] dma_rows_das; `FF(dma_meta_q, dma_meta, '0, clk_i, rst_ni); idma_split_midend #( - .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), - .DmaRegionStart (TCDMBaseAddr ), - .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .AddrWidth (AddrWidth ), - .burst_req_t (dma_req_t ), - .meta_t (dma_meta_t ) + .DmaRegionWidth (NumBanksPerGroup*NumGroups*4), // #DmaBytes = #banks*4 = 4096*4 // size per row + .DmaRegionStart (TCDMBaseAddr ), // 0x0000_0000, defined in tb + .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), // TCDMSize = #banks*l1banksize = 4096*1024 // size of DMA region + .AddrWidth (AddrWidth ), + .burst_req_t (dma_req_t ), + .meta_t (dma_meta_t ), + .NumTiles (NumTiles ), + .NumBanksPerTile (NumBanksPerTile ), + .TCDMSizePerBank (TCDMSizePerBank ), + .NumDASPartitions (NumDASPartitions ), + .NumTilesPerDma (NumTilesPerDma ), + .DASStartAddr (DASStartAddr ) ) i_idma_split_midend ( - 
.clk_i (clk_i ), - .rst_ni (rst_ni ), - .burst_req_i(dma_req_cut ), - .valid_i (dma_req_cut_valid ), - .ready_o (dma_req_cut_ready ), - .meta_o (dma_meta_cut ), - .burst_req_o(dma_req_split ), - .valid_o (dma_req_split_valid), - .ready_i (dma_req_split_ready), - .meta_i (dma_meta_split ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .rows_das_o (dma_rows_das ), +`endif + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req_split ), + .valid_o (dma_req_split_valid), + .ready_i (dma_req_split_ready), + .meta_i (dma_meta_split ) ); idma_distributed_midend #( @@ -110,11 +131,16 @@ module mempool_cluster .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize), .TransFifoDepth (16 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( .clk_i (clk_i ), .rst_ni (rst_ni ), +`ifdef DAS + .rows_das_i (dma_rows_das ), +`endif .burst_req_i (dma_req_split ), .valid_i (dma_req_split_valid), .ready_o (dma_req_split_ready), @@ -294,6 +320,12 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), +`endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request @@ -335,6 +367,12 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i 
(dma_rows_das ), +`endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request @@ -373,6 +411,12 @@ module mempool_cluster .tcdm_slave_resp_o (tcdm_slave_resp[g] ), .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), +`endif .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request @@ -435,38 +479,44 @@ module mempool_cluster .TCDMBaseAddr (TCDMBaseAddr ), .BootAddr (BootAddr ) ) i_group ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (testmode_i ), - .scan_enable_i (scan_enable_i ), - .scan_data_i (/* Unconnected */ ), - .scan_data_o (/* Unconnected */ ), - .group_id_i (g[idx_width(NumGroups)-1:0] ), + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (testmode_i ), + .scan_enable_i (scan_enable_i ), + .scan_data_i (/* Unconnected */ ), + .scan_data_o (/* Unconnected */ ), + .group_id_i (g[idx_width(NumGroups)-1:0] ), // TCDM Master interfaces - .tcdm_master_req_o (tcdm_master_req[g] ), - .tcdm_master_req_valid_o (tcdm_master_req_valid[g] ), - .tcdm_master_req_ready_i (tcdm_master_req_ready[g] ), - .tcdm_master_resp_i (tcdm_master_resp[g] ), - .tcdm_master_resp_valid_i(tcdm_master_resp_valid[g] ), - .tcdm_master_resp_ready_o(tcdm_master_resp_ready[g] ), + .tcdm_master_req_o (tcdm_master_req[g] ), + .tcdm_master_req_valid_o (tcdm_master_req_valid[g] ), + .tcdm_master_req_ready_i (tcdm_master_req_ready[g] ), + .tcdm_master_resp_i (tcdm_master_resp[g] ), + .tcdm_master_resp_valid_i (tcdm_master_resp_valid[g] ), + .tcdm_master_resp_ready_o (tcdm_master_resp_ready[g] ), // TCDM banks interface - .tcdm_slave_req_i (tcdm_slave_req[g] ), - .tcdm_slave_req_valid_i (tcdm_slave_req_valid[g] ), - .tcdm_slave_req_ready_o (tcdm_slave_req_ready[g] ), - 
.tcdm_slave_resp_o (tcdm_slave_resp[g] ), - .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), - .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), - .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), - .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), + .tcdm_slave_req_i (tcdm_slave_req[g] ), + .tcdm_slave_req_valid_i (tcdm_slave_req_valid[g] ), + .tcdm_slave_req_ready_o (tcdm_slave_req_ready[g] ), + .tcdm_slave_resp_o (tcdm_slave_resp[g] ), + .tcdm_slave_resp_valid_o (tcdm_slave_resp_valid[g] ), + .tcdm_slave_resp_ready_i (tcdm_slave_resp_ready[g] ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), + .dma_rows_das_i (dma_rows_das ), +`endif + .wake_up_i (wake_up_q[g*NumCoresPerGroup +: NumCoresPerGroup] ), + .ro_cache_ctrl_i (ro_cache_ctrl_q[g] ), // DMA request - .dma_req_i (dma_req_group_q[g] ), - .dma_req_valid_i (dma_req_group_q_valid[g] ), - .dma_req_ready_o (dma_req_group_q_ready[g] ), + .dma_req_i (dma_req_group_q[g] ), + .dma_req_valid_i (dma_req_group_q_valid[g] ), + .dma_req_ready_o (dma_req_group_q_ready[g] ), // DMA status - .dma_meta_o (dma_meta[g] ), + .dma_meta_o (dma_meta[g] ), // AXI interface - .axi_mst_req_o (axi_mst_req_o[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup] ), - .axi_mst_resp_i (axi_mst_resp_i[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup]) + .axi_mst_req_o (axi_mst_req_o[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup] ), + .axi_mst_resp_i (axi_mst_resp_i[g*NumAXIMastersPerGroup +: NumAXIMastersPerGroup]) ); end : gen_groups diff --git a/hardware/src/mempool_group.sv b/hardware/src/mempool_group.sv index 733f98b9c..3b18fa32e 100644 --- a/hardware/src/mempool_group.sv +++ b/hardware/src/mempool_group.sv @@ -59,18 +59,26 @@ module mempool_group input logic [NumGroups-1:1][NumTilesPerGroup-1:0] tcdm_slave_resp_ready_i, `endif // Wake up interface - input logic [NumCoresPerGroup-1:0] wake_up_i, + input logic [NumCoresPerGroup-1:0] wake_up_i, // RO-Cache 
configuration - input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, + input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, // DMA request - input `STRUCT_PORT(dma_req_t) dma_req_i, - input logic dma_req_valid_i, - output logic dma_req_ready_o, + input `STRUCT_PORT(dma_req_t) dma_req_i, + input logic dma_req_valid_i, + output logic dma_req_ready_o, // DMA status - output `STRUCT_PORT(dma_meta_t) dma_meta_o, + output `STRUCT_PORT(dma_meta_t) dma_meta_o, // AXI Interface - output `STRUCT_VECT(axi_tile_req_t, [NumAXIMastersPerGroup-1:0]) axi_mst_req_o, - input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerGroup-1:0]) axi_mst_resp_i + output `STRUCT_VECT(axi_tile_req_t, [NumAXIMastersPerGroup-1:0]) axi_mst_req_o, + input `STRUCT_VECT(axi_tile_resp_t, [NumAXIMastersPerGroup-1:0]) axi_mst_resp_i +`ifdef DAS + , + // DAS partition configuration + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] tiles_das_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i, + input logic [RowsInterleavingWidth-1:0] dma_rows_das_i +`endif ); /***************** @@ -332,6 +340,11 @@ module mempool_group .axi_mst_resp_i (axi_mst_resp[sg*NumAXIMastersPerSubGroup +: NumAXIMastersPerSubGroup] ), // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) ); @@ -384,6 +397,11 @@ module mempool_group .axi_mst_resp_i (axi_mst_resp[sg*NumAXIMastersPerSubGroup +: NumAXIMastersPerSubGroup] ), // RO-Cache configuration .ro_cache_ctrl_i (ro_cache_ctrl_q ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[sg*NumCoresPerSubGroup +: NumCoresPerSubGroup] ) ); @@ -565,20 +583,25 @@ module 
mempool_group .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), .DmaRegionStart (TCDMBaseAddr ), .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .TransFifoDepth (16 ), + .TransFifoDepth (8 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), .burst_req_t (dma_req_t ), .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .burst_req_i (dma_req_cut ), - .valid_i (dma_req_cut_valid), - .ready_o (dma_req_cut_ready), - .meta_o (dma_meta_cut ), - .burst_req_o (dma_req ), - .valid_o (dma_req_valid ), - .ready_i (dma_req_ready ), - .meta_i (dma_meta ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS + .rows_das_i (dma_rows_das_i ), +`endif + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req ), + .valid_o (dma_req_valid ), + .ready_i (dma_req_ready ), + .meta_i (dma_meta ) ); `else @@ -683,6 +706,11 @@ module mempool_group // AXI interface .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) ); @@ -971,24 +999,29 @@ module mempool_group dma_meta_t [NumDmasPerGroup-1:0] dma_meta; idma_distributed_midend #( - .NoMstPorts (NumDmasPerGroup ), - .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup), - .DmaRegionStart (TCDMBaseAddr ), - .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), - .TransFifoDepth (16 ), - .burst_req_t (dma_req_t ), - .meta_t (dma_meta_t ) + .NoMstPorts (NumDmasPerGroup ), + .DmaRegionWidth (NumBanksPerGroup*4/NumDmasPerGroup ), + .DmaRegionStart (TCDMBaseAddr ), + .DmaRegionEnd (TCDMBaseAddr+TCDMSize ), + .TransFifoDepth (8 ), + .NumTiles (NumTiles ), + .NumDASPartitions (NumDASPartitions ), + .burst_req_t (dma_req_t ), + .meta_t (dma_meta_t ) ) i_idma_distributed_midend ( - .clk_i (clk_i ), - .rst_ni 
(rst_ni ), - .burst_req_i (dma_req_cut ), - .valid_i (dma_req_cut_valid), - .ready_o (dma_req_cut_ready), - .meta_o (dma_meta_cut ), - .burst_req_o (dma_req ), - .valid_o (dma_req_valid ), - .ready_i (dma_req_ready ), - .meta_i (dma_meta ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), +`ifdef DAS + .rows_das_i (dma_rows_das_i ), +`endif + .burst_req_i (dma_req_cut ), + .valid_i (dma_req_cut_valid ), + .ready_o (dma_req_cut_ready ), + .meta_o (dma_meta_cut ), + .burst_req_o (dma_req ), + .valid_o (dma_req_valid ), + .ready_i (dma_req_ready ), + .meta_i (dma_meta ) ); // xbar diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index 5d427feec..cf6877959 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -44,6 +44,11 @@ package mempool_pkg; localparam integer unsigned NumBanksPerGroup = NumBanks / NumGroups; localparam integer unsigned TCDMAddrMemWidth = $clog2(TCDMSizePerBank / mempool_pkg::BeWidth); localparam integer unsigned TCDMAddrWidth = TCDMAddrMemWidth + idx_width(NumBanksPerGroup); + // DAS parameters + localparam integer unsigned NumDASPartitions = `ifdef NUM_DAS_PARTITIONS `NUM_DAS_PARTITIONS `else 0 `endif; + localparam integer unsigned DASStartAddr = (NumBanks * TCDMSizePerBank) - NumCores * (`ifdef DAS_MEM_SIZE `DAS_MEM_SIZE `else 0 `endif); + localparam integer unsigned TileInterleavingWidth = idx_width(NumTiles) + 1; // only support {128, 64, 32, 16, 8, 4, 2, 1}; + localparam integer unsigned RowsInterleavingWidth = idx_width(TCDMSizePerBank) - ByteOffset; // L2 localparam integer unsigned L2Size = `ifdef L2_SIZE `L2_SIZE `else 0 `endif; // [B] diff --git a/hardware/src/mempool_sub_group.sv b/hardware/src/mempool_sub_group.sv index a3577450f..e09e3b1f5 100644 --- a/hardware/src/mempool_sub_group.sv +++ b/hardware/src/mempool_sub_group.sv @@ -64,6 +64,13 @@ module mempool_sub_group input `STRUCT_PORT(ro_cache_ctrl_t) ro_cache_ctrl_i, // Wake up interface input logic [NumCoresPerSubGroup-1:0] wake_up_i +`ifdef DAS 
+ , + // DAS partition configuration + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] tiles_das_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i +`endif ); /***************** @@ -198,6 +205,11 @@ module mempool_sub_group // AXI interface .axi_mst_req_o (axi_tile_req[t] ), .axi_mst_resp_i (axi_tile_resp[t] ), +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`endif // Wake up interface .wake_up_i (wake_up_q[t*NumCoresPerTile +: NumCoresPerTile]) ); diff --git a/hardware/src/mempool_system.sv b/hardware/src/mempool_system.sv index c4610486f..acc87ba62 100644 --- a/hardware/src/mempool_system.sv +++ b/hardware/src/mempool_system.sv @@ -90,6 +90,12 @@ module mempool_system logic [NumCores-1:0] wake_up; logic [DataWidth-1:0] eoc; ro_cache_ctrl_t ro_cache_ctrl; +`ifdef DAS + // For dynamic partitioning + logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] tiles_das; + logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das; + logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das; +`endif dma_req_t dma_req; logic dma_req_valid; @@ -137,20 +143,25 @@ module mempool_system .TCDMBaseAddr(TCDMBaseAddr), .BootAddr (BootAddr ) ) i_mempool_cluster ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .wake_up_i (wake_up ), - .testmode_i (1'b0 ), - .scan_enable_i (1'b0 ), - .scan_data_i (1'b0 ), - .scan_data_o (/* Unused */ ), - .ro_cache_ctrl_i(ro_cache_ctrl ), - .dma_req_i (dma_req ), - .dma_req_valid_i(dma_req_valid ), - .dma_req_ready_o(dma_req_ready ), - .dma_meta_o (dma_meta ), - .axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ), - .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .wake_up_i (wake_up ), +`ifdef DAS + .tiles_das_i (tiles_das ), + .start_das_i (start_das ), + .rows_das_i (rows_das ), +`endif + .testmode_i (1'b0 ), + .scan_enable_i (1'b0 ), + .scan_data_i 
(1'b0 ), + .scan_data_o (/* Unused */ ), + .ro_cache_ctrl_i (ro_cache_ctrl ), + .dma_req_i (dma_req ), + .dma_req_valid_i (dma_req_valid ), + .dma_req_ready_o (dma_req_ready ), + .dma_meta_o (dma_meta ), + .axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ), + .axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0]) ); /********************** @@ -801,6 +812,11 @@ module mempool_system .axi_lite_slave_resp_o(axi_lite_slv_resp[CtrlRegisters]), .eoc_o (/* Unused */ ), .eoc_valid_o (eoc_valid_o ), +`ifdef DAS + .tiles_das_o (tiles_das ), + .start_das_o (start_das ), + .rows_das_o (rows_das ), +`endif .wake_up_o (wake_up ), .ro_cache_ctrl_o (ro_cache_ctrl ) ); diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 49c8ddaea..ee87dc32d 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -53,6 +53,13 @@ module mempool_tile input `STRUCT_PORT(axi_tile_resp_t) axi_mst_resp_i, // Wake up interface input logic [NumCoresPerTile-1:0] wake_up_i +`ifdef DAS + , + // DAS partition configuration + input logic [NumDASPartitions-1:0][TileInterleavingWidth-1:0] tiles_das_i, + input logic [NumDASPartitions-1:0][AddrWidth-1:0] start_das_i, + input logic [NumDASPartitions-1:0][RowsInterleavingWidth-1:0] rows_das_i +`endif ); /**************** @@ -893,13 +900,24 @@ module mempool_tile address_scrambler #( .AddrWidth (AddrWidth ), .ByteOffset (ByteOffset ), + .Bypass (0 ), .NumTiles (NumTiles ), .NumBanksPerTile (NumBanksPerTile ), - .Bypass (0 ), - .SeqMemSizePerTile (SeqMemSizePerTile) + .TCDMSizePerBank (TCDMSizePerBank ), + .SeqMemSizePerTile (SeqMemSizePerTile), + .NumDASPartitions (NumDASPartitions ) ) i_address_scrambler ( - .address_i (snitch_data_qaddr[c] ), - .address_o (snitch_data_qaddr_scrambled) +`ifdef DAS + .tiles_das_i (tiles_das_i ), + .start_das_i (start_das_i ), + .rows_das_i (rows_das_i ), +`else + .tiles_das_i (NumTiles ), + .start_das_i ('0 ), + .rows_das_i ('0 ), +`endif + .address_i (snitch_data_qaddr[c]), + .address_o 
(snitch_data_qaddr_scrambled) ); if (!TrafficGeneration) begin: gen_tcdm_shim diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile index 73953dc07..1b1f24fb3 100644 --- a/software/apps/baremetal/Makefile +++ b/software/apps/baremetal/Makefile @@ -55,4 +55,4 @@ clean: rm -vf $(LINKER_SCRIPT) rm -vf $(wildcard $(DATA_DIR)/data_*.h) -.INTERMEDIATE: $(addsuffix /main.c.o,$(APPS)) +.INTERMEDIATE: $(addsuffix /main.c.o,$(APPS)) \ No newline at end of file diff --git a/software/apps/baremetal/das_gemm_f32/main.c b/software/apps/baremetal/das_gemm_f32/main.c new file mode 100644 index 000000000..e090fa884 --- /dev/null +++ b/software/apps/baremetal/das_gemm_f32/main.c @@ -0,0 +1,92 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Bowen Wang +// Desc: GEMM f32 benchmark using DAS (Dynamic Address Scrambling) + +#include +#include + +#include "alloc.h" +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#include "data_das_gemm_f32.h" + +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_matmul_f32.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) + +// Tiles per DAS partition: NUM_TILES = fully interleaved (baseline) +#ifndef TILES_PER_PARTITION +#define TILES_PER_PARTITION NUM_TILES +#endif + +// Shared pointers for DAS-allocated matrices +float *volatile shared_a __attribute__((section(".l1"))); +float *volatile shared_b __attribute__((section(".l1"))); +float *volatile shared_c __attribute__((section(".l1"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + mempool_init(core_id); + mempool_barrier_init(core_id); + + uint32_t a_size = matrix_M * matrix_N * sizeof(float); + uint32_t b_size = matrix_N * matrix_P * sizeof(float); + uint32_t c_size = matrix_M 
* matrix_P * sizeof(float); + + if (core_id == 0) { + // Initialize DAS dynamic heap allocator + mempool_dynamic_heap_alloc_init(core_id); + alloc_t *das_alloc = get_dynamic_heap_alloc(); + + // Allocate matrices in DAS region + shared_a = (float *)partition_malloc(das_alloc, a_size); + shared_b = (float *)partition_malloc(das_alloc, b_size); + shared_c = (float *)partition_malloc(das_alloc, c_size); + + // Configure DAS partitions + das_config(0, TILES_PER_PARTITION, (uint32_t)shared_a, a_size); + das_config(1, TILES_PER_PARTITION, (uint32_t)shared_b, b_size); + das_config(2, TILES_PER_PARTITION, (uint32_t)shared_c, c_size); + + // DMA: copy input matrices from L2 to DAS-allocated L1 + dma_memcpy_blocking(shared_a, l2_A, a_size); + dma_memcpy_blocking(shared_b, l2_B, b_size); + } + mempool_barrier(num_cores); + + // All cores read the shared pointers + float *matrix_a = shared_a; + float *matrix_b = shared_b; + float *matrix_c = shared_c; + + // Benchmark: parallel GEMM + mempool_start_benchmark(); + matmul_2x2_parallel_f32(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N, + matrix_P, core_id, num_cores); + mempool_barrier(num_cores); + mempool_stop_benchmark(); + + // Verify against golden result + mempool_check_f32(matrix_c, l2_C, matrix_M * matrix_P, 0.01f, 0); + mempool_barrier(num_cores); + + // Cleanup + if (core_id == 0) { + alloc_t *das_alloc = get_dynamic_heap_alloc(); + partition_free(das_alloc, shared_c); + partition_free(das_alloc, shared_b); + partition_free(das_alloc, shared_a); + } + + return 0; +} diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py index 874cf1f43..3ef0d58de 100644 --- a/software/data/gendata_header.py +++ b/software/data/gendata_header.py @@ -188,6 +188,7 @@ def get_type(type_string): "matmul_f8": {"func": datalib.generate_fmatmul}, "matmul_f16": {"func": datalib.generate_fmatmul}, "matmul_f32": {"func": datalib.generate_fmatmul}, + "das_gemm_f32": {"func": datalib.generate_fmatmul}, "matmul_i32": 
{"func": datalib.generate_imatmul}, "matmul_i16": {"func": datalib.generate_imatmul}, "matmul_i8": {"func": datalib.generate_imatmul}, diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson index d7e9c851b..fc898019a 100644 --- a/software/data/gendata_params.hjson +++ b/software/data/gendata_params.hjson @@ -282,6 +282,20 @@ ] } + "das_gemm_f32": { + "type": "float32", + "defines": [ + ("matrix_M", 64) + ("matrix_N", 64) + ("matrix_P", 64) + ] + "arrays": [ + ("float", "l2_A") + ("float", "l2_B") + ("float", "l2_C") + ] + } + "matmul_i16": { "type": "int16", "defines": [ diff --git a/software/runtime/alloc.c b/software/runtime/alloc.c index 519bd8e32..e6d655dd8 100644 --- a/software/runtime/alloc.c +++ b/software/runtime/alloc.c @@ -34,6 +34,11 @@ alloc_t alloc_l1; // Allocators for L1 local sequential heap memory alloc_t alloc_tile[NUM_CORES / NUM_CORES_PER_TILE]; +// ---------------------------------------------------------------------------- +// Dynamic Heap Allocator +// ---------------------------------------------------------------------------- +alloc_t dynamic_heap_alloc; + // ---------------------------------------------------------------------------- // Canary System based on LSBs of block pointer // ---------------------------------------------------------------------------- @@ -55,6 +60,15 @@ static inline canary_and_size_t canary_decode(const uint32_t value) { return (canary_and_size_t){.canary = value & 0xFF, .size = value >> 8}; } +typedef struct canary_chain_s { + uint32_t canary_and_size; + uint32_t *data_address; + struct canary_chain_s *next_canary; +} canary_chain_t; + +// Starts as the 0x1000 sentinel (empty chain), not NULL; reassigned when the first canary is allocated +canary_chain_t *first_canary = (canary_chain_t *)0x1000; + // ---------------------------------------------------------------------------- // Initialization // ---------------------------------------------------------------------------- @@ -116,6 +130,98 @@ static void
*allocate_memory(alloc_t *alloc, const uint32_t size) { } } +// ------ Padding (in bytes) from addr up to the next row boundary; 0 if aligned ------ // +static uint32_t calc_aligned_row_size(uint32_t *addr) { + + const uint32_t row_bytes = NUM_BANKS * sizeof(uint32_t); + const uint32_t mask = (uint32_t)(row_bytes - 1); + uint32_t offset = ((uint32_t)addr) & mask; + + return (row_bytes - offset) & mask; +} + +// ------ Parameters ------ // +// alloc: Allocator whose linked list of free blocks is searched +// size: Size of the data block that needs to be allocated +static void *allocate_memory_aligned(alloc_t *alloc, const uint32_t size) { + // Get first block of linked list of free blocks + alloc_block_t *curr = alloc->first_block; + alloc_block_t *prev = 0; + + // Search first block large enough in linked list + // 1. calculate the size aligned to the partition boundary + uint32_t shift_size = 0; + shift_size = calc_aligned_row_size((uint32_t *)curr); + uint32_t aligned_size = size + shift_size; + + while (curr && (curr->size < aligned_size)) { + prev = curr; + curr = curr->next; + shift_size = calc_aligned_row_size((uint32_t *)curr); + aligned_size = size + shift_size; + } + + if (curr) { + // Update allocator + if (size == aligned_size) { + // address is already aligned to the partition boundary + if (curr->size == size) { + // Special case: Whole block taken + if (prev) { + prev->next = curr->next; + } else { + alloc->first_block = curr->next; + } + } else { + // Regular case: Split off block + alloc_block_t *new_block = (alloc_block_t *)((char *)curr + size); + new_block->size = curr->size - size; + new_block->next = curr->next; + if (prev) { + prev->next = new_block; + } else { + alloc->first_block = new_block; + } + } + } else { + if (curr->size == aligned_size) { + // Special case: Whole block taken, but the first shift_size bytes of + // the block stay free; save curr->next before curr is reused. + struct alloc_block_s *tmp_next = curr->next; + alloc_block_t *shift_block =
(alloc_block_t *)((char *)curr); + shift_block->size = shift_size; + shift_block->next = tmp_next; + if (prev) { + prev->next = shift_block; + } else { + alloc->first_block = shift_block; + } + } else { + // Regular case: Split off block + alloc_block_t *new_block = + (alloc_block_t *)((char *)curr + aligned_size); + new_block->size = curr->size - aligned_size; + new_block->next = curr->next; + + alloc_block_t *shift_block = (alloc_block_t *)((char *)curr); + shift_block->size = shift_size; + shift_block->next = new_block; + if (prev) { + prev->next = shift_block; + } else { + alloc->first_block = shift_block; + } + } + } + + // Return block pointer + return (void *)((char *)curr + shift_size); + } else { + // There is no free block large enough + return NULL; + } +} + void *domain_malloc(alloc_t *alloc, const uint32_t size) { // Calculate actually required block size uint32_t data_size = size + sizeof(uint32_t); // add size/metadata @@ -147,6 +253,81 @@ void *simple_malloc(const uint32_t size) { return domain_malloc(&alloc_l1, size); } +void *partition_malloc(alloc_t *alloc, const uint32_t size) { + + uint32_t data_size = size > 2 * NUM_BANKS * sizeof(uint32_t) + ? 
size + : 2 * NUM_BANKS * sizeof(uint32_t); + uint32_t block_size = ALIGN_UP(data_size, MIN_BLOCK_SIZE); // add alignment + + // Check whether the request exceeds the maximum allowed size + if (block_size >= (1 << (sizeof(uint32_t) * 8 - sizeof(uint8_t) * 8))) { + printf("Memory allocator: Requested memory exceeds max block size\n"); + return NULL; + } + + // allocate + void *block_ptr = NULL; + block_ptr = allocate_memory_aligned(alloc, block_size); + + if (!block_ptr) { + printf("Memory allocator: No large enough block found (%d)\n", block_size); + return NULL; + } + + // Allocate a region in L1 heap for canary + canary_chain_t *canary = + (canary_chain_t *)simple_malloc(sizeof(canary_chain_t)); + // Init the canary. NOTE(review): simple_malloc result is not NULL-checked + canary->data_address = (uint32_t *)block_ptr; + canary->canary_and_size = canary_encode(block_ptr, block_size); + canary->next_canary = NULL; + + // link the canary into the list + canary_chain_t *curr = first_canary; + canary_chain_t *prev = 0; + + // Fit the canary into the chain, depending on data_address + // | prev | ------> | canary | ------> | curr | + uint32_t *data_addr = 0; + if (curr != (canary_chain_t *)0x1000) { + // only dereference curr when the chain is not the 0x1000 empty sentinel + data_addr = curr->data_address; + } + + while ((curr != (canary_chain_t *)0x1000) && (curr != NULL) && + ((uint32_t *)data_addr < (uint32_t *)block_ptr)) { + prev = curr; + curr = curr->next_canary; + if (curr != NULL) { + data_addr = curr->data_address; + } + } + + if ((curr == (canary_chain_t *)0x1000) && !prev) { + // special case: first canary block + first_canary = canary; + } else { + if (!curr) { + // reached the end of the chain + // | prev | ------> | canary | ------> NULL + prev->next_canary = canary; + canary->next_canary = NULL; + } else if (!prev) { + // canary needs to be inserted at the beginning of the chain + // first_canary ------> | canary | ------> | curr | + first_canary = canary; + canary->next_canary = curr; + } else { + // normal case + // | prev | ------> | canary | ------> | curr | +
canary->next_canary = prev->next_canary; + prev->next_canary = canary; + } + } + return block_ptr; +} + // ---------------------------------------------------------------------------- // Free Memory // ---------------------------------------------------------------------------- @@ -208,6 +389,72 @@ void domain_free(alloc_t *alloc, void *const ptr) { void simple_free(void *const ptr) { domain_free(&alloc_l1, ptr); } +void partition_free(alloc_t *alloc, void *const ptr) { + // block pointer is the input pointer + void *block_ptr = ptr; + + canary_and_size_t canary_and_size = + (canary_and_size_t){.canary = 0, .size = 0}; + // find the canary block in the chain + canary_chain_t *curr = first_canary; + canary_chain_t *prev = 0; + + // Loop should stop at curr->data_address == block_ptr: | prev | --> | curr | + // NOTE(review): `if (curr)` below is also true for the 0x1000 empty sentinel + uint32_t *data_addr = 0; + if (curr) { + data_addr = curr->data_address; + } + while ((curr != (canary_chain_t *)0x1000) && (curr != NULL) && + (data_addr < (uint32_t *)block_ptr)) { + prev = curr; + curr = curr->next_canary; + if (curr != NULL) { + data_addr = curr->data_address; + } + } + + if ((curr == (canary_chain_t *)0x1000) && !prev) { + // nothing in the chain + printf("CANARY: Empty canary chain!\n"); + } else if (!curr) { + // reached the end of the chain + printf("CANARY: Chain depleted. No info found for %p\n", block_ptr); + } else if (curr->data_address != block_ptr) { + // no canary is recorded for the pointer being freed + printf("CANARY: Unmatch!
%p - %p\n", curr->data_address, block_ptr); + } else if (!prev) { + // normal case 1: curr is the first canary + // first_canary ------> | curr | ------> next + canary_and_size = canary_decode(curr->canary_and_size); + if (curr->next_canary == NULL) { + first_canary = (canary_chain_t *)0x1000; + } else { + first_canary = curr->next_canary; + } + + simple_free((void *)curr); + } else { + // normal case 2: relink the chain, free the curr canary + // | prev | ------> | curr | ------> something + canary_and_size = canary_decode(curr->canary_and_size); + prev->next_canary = curr->next_canary; + simple_free((void *)curr); + } + + // Check for memory overflow + if (canary_and_size.canary != canary(block_ptr)) { + if (!canary_and_size.canary) { + printf("Empty canary.\n"); + } + printf("Memory Overflow at %p\n", block_ptr); + return; + } + + // Free memory + free_memory(alloc, block_ptr, canary_and_size.size); +} + // ---------------------------------------------------------------------------- // Debugging Functions // ---------------------------------------------------------------------------- @@ -233,9 +480,31 @@ void alloc_dump(alloc_t *alloc) { } } +void canary_dump(void) { + printf(" ------ Canary Chain Dump ------ \n"); + canary_chain_t *curr = first_canary; + if (curr == (canary_chain_t *)0x1000) { + // empty list + printf("Empty Canary list.\n"); + } else { + uint32_t cnt = 0; + while (curr != NULL) { + printf("[%d] - [%p] - [%p] - [%p]\n", cnt, curr, curr->data_address, + curr->next_canary); + cnt += 1; + curr = curr->next_canary; + } + } + printf(" ------ Canary Dump END ------ \n"); +} + // ---------------------------------------------------------------------------- // Get Allocators // ---------------------------------------------------------------------------- +// Get the address of global variable `alloc_l1` alloc_t *get_alloc_l1() { return &alloc_l1; } alloc_t *get_alloc_tile(const uint32_t tile_id) { return &alloc_tile[tile_id]; } + +// Dynamic Heap 
Allocator +alloc_t *get_dynamic_heap_alloc() { return &dynamic_heap_alloc; } diff --git a/software/runtime/alloc.h b/software/runtime/alloc.h index f6db489a2..453218228 100644 --- a/software/runtime/alloc.h +++ b/software/runtime/alloc.h @@ -44,6 +44,7 @@ void domain_free(alloc_t *alloc, void *const ptr); // Print out linked list of free blocks void alloc_dump(alloc_t *alloc); +void canary_dump(void); // Get allocator for L1 interleaved heap memory alloc_t *get_alloc_l1(); @@ -51,4 +52,13 @@ alloc_t *get_alloc_l1(); // Get allocator for L1 local sequential heap memory alloc_t *get_alloc_tile(const uint32_t tile_id); +// ----- Dynamic Heap Allocator ----- // +alloc_t *get_dynamic_heap_alloc(); + +// Dynamic heap allocation with Canary Chain +void *partition_malloc(alloc_t *alloc, const uint32_t size); + +// Free dynamic heap allocation with Canary chain +void partition_free(alloc_t *alloc, void *const ptr); + #endif diff --git a/software/runtime/arch.ld.c b/software/runtime/arch.ld.c index 1d8de5e57..6fb38c996 100644 --- a/software/runtime/arch.ld.c +++ b/software/runtime/arch.ld.c @@ -31,5 +31,12 @@ SECTIONS { __heap_start = __l1_start; __heap_end = __l1_end; + // DAS related, default impacted region size +#ifdef DAS_MEM_SIZE + __heap_seq_start = __l1_start + (NUM_CORES * BANKING_FACTOR * L1_BANK_SIZE) - NUM_CORES * DAS_MEM_SIZE; +#else + __heap_seq_start = __l1_end; +#endif + fake_uart = 0xC0000000; } diff --git a/software/runtime/control_registers.h b/software/runtime/control_registers.h index 541825143..69f0ac426 100644 --- a/software/runtime/control_registers.h +++ b/software/runtime/control_registers.h @@ -19,6 +19,9 @@ extern "C" { // Maximum number of groups that we support in any configuration #define CONTROL_REGISTERS_PARAM_MAX_NUMGROUPS 8 +// Supported number of DAS partitions +#define CONTROL_REGISTERS_PARAM_NUM_DAS_PARTITIONS 4 + // Register width #define CONTROL_REGISTERS_PARAM_REG_WIDTH 32 @@ -115,6 +118,58 @@ extern "C" { // Read-only cache Region 
End #define CONTROL_REGISTERS_RO_CACHE_END_3_REG_OFFSET 0x64 +// DAS per region folding granularity. Number of Tiles per DAS partition. +// (common parameters) +#define CONTROL_REGISTERS_TILES_DAS_TILES_DAS_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_TILES_DAS_TILES_DAS_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_TILES_DAS_MULTIREG_COUNT 4 + +// DAS per region folding granularity. Number of Tiles per DAS partition. +#define CONTROL_REGISTERS_TILES_DAS_0_REG_OFFSET 0x68 + +// DAS per region folding granularity. Number of Tiles per DAS partition. +#define CONTROL_REGISTERS_TILES_DAS_1_REG_OFFSET 0x6c + +// DAS per region folding granularity. Number of Tiles per DAS partition. +#define CONTROL_REGISTERS_TILES_DAS_2_REG_OFFSET 0x70 + +// DAS per region folding granularity. Number of Tiles per DAS partition. +#define CONTROL_REGISTERS_TILES_DAS_3_REG_OFFSET 0x74 + +// DAS per region allocated start address. (common parameters) +#define CONTROL_REGISTERS_START_DAS_START_DAS_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_START_DAS_START_DAS_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_START_DAS_MULTIREG_COUNT 4 + +// DAS per region allocated start address. +#define CONTROL_REGISTERS_START_DAS_0_REG_OFFSET 0x78 + +// DAS per region allocated start address. +#define CONTROL_REGISTERS_START_DAS_1_REG_OFFSET 0x7c + +// DAS per region allocated start address. +#define CONTROL_REGISTERS_START_DAS_2_REG_OFFSET 0x80 + +// DAS per region allocated start address. +#define CONTROL_REGISTERS_START_DAS_3_REG_OFFSET 0x84 + +// DAS per region allocated size. (common parameters) +#define CONTROL_REGISTERS_ROWS_DAS_ROWS_DAS_FIELD_WIDTH 32 +#define CONTROL_REGISTERS_ROWS_DAS_ROWS_DAS_FIELDS_PER_REG 1 +#define CONTROL_REGISTERS_ROWS_DAS_MULTIREG_COUNT 4 + +// DAS per region allocated size. +#define CONTROL_REGISTERS_ROWS_DAS_0_REG_OFFSET 0x88 + +// DAS per region allocated size. +#define CONTROL_REGISTERS_ROWS_DAS_1_REG_OFFSET 0x8c + +// DAS per region allocated size. 
+#define CONTROL_REGISTERS_ROWS_DAS_2_REG_OFFSET 0x90 + +// DAS per region allocated size. +#define CONTROL_REGISTERS_ROWS_DAS_3_REG_OFFSET 0x94 + #ifdef __cplusplus } // extern "C" #endif diff --git a/software/runtime/dma.h b/software/runtime/dma.h index 4aa7f6cec..cab318d28 100644 --- a/software/runtime/dma.h +++ b/software/runtime/dma.h @@ -73,4 +73,5 @@ void dma_memcpy_blocking(void *dest, const void *src, size_t len) { dma_memcpy_nonblocking(dest, src, len); dma_wait(); } + #endif // _DMA_H_ diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 7ec0aa8d2..59744f066 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -54,6 +54,49 @@ static uint32_t volatile *wake_up_offset_reg = (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + CONTROL_REGISTERS_WAKE_UP_OFFST_REG_OFFSET); +#ifdef NUM_DAS_PARTITIONS +/* DAS-related regs */ + +static uint32_t volatile *tiles_das_0_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_TILES_DAS_0_REG_OFFSET); +static uint32_t volatile *tiles_das_1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_TILES_DAS_1_REG_OFFSET); +static uint32_t volatile *tiles_das_2_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_TILES_DAS_2_REG_OFFSET); +static uint32_t volatile *tiles_das_3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_TILES_DAS_3_REG_OFFSET); + +static uint32_t volatile *start_das_0_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_DAS_0_REG_OFFSET); +static uint32_t volatile *start_das_1_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_DAS_1_REG_OFFSET); +static uint32_t volatile *start_das_2_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + CONTROL_REGISTERS_START_DAS_2_REG_OFFSET); +static uint32_t volatile *start_das_3_reg = + (uint32_t volatile *)(CONTROL_REGISTER_OFFSET + + 
// Re-initialise the L1 interleaved heap allocator so that it covers the range
// from `__heap_start` up to (not including) `heap_seq_start`.
//
// core_id:        calling core; only core 0 performs the re-initialisation
// heap_seq_start: address at which the dynamic heap region begins
//
// No range check is done: the size is an unsigned difference, so a value
// below `&__heap_start` wraps around.
static inline void mempool_reset_heap(const uint32_t core_id,
                                      uint32_t heap_seq_start) {
  extern uint32_t __heap_start;

  if (core_id != 0) {
    return;
  }

  // Interleaved heap shrinks to whatever lies below the dynamic heap
  const uint32_t interleaved_bytes = heap_seq_start - (uint32_t)&__heap_start;
  alloc_init(get_alloc_l1(), &__heap_start, interleaved_bytes);
}

#ifdef DAS_MEM_SIZE
// Initialise the dynamic heap allocator with the default region: it starts at
// the `__heap_seq_start` symbol and spans NUM_CORES * DAS_MEM_SIZE bytes.
//
// core_id: calling core; only core 0 performs the initialisation
static inline void mempool_dynamic_heap_alloc_init(const uint32_t core_id) {
  extern uint32_t __heap_seq_start;

  if (core_id != 0) {
    return;
  }

  const uint32_t region_base = (uint32_t)&__heap_seq_start;
  const uint32_t region_bytes = NUM_CORES * DAS_MEM_SIZE;
  alloc_init(get_dynamic_heap_alloc(), (uint32_t *)region_base, region_bytes);
}
#endif /* DAS_MEM_SIZE */

// Re-initialise the dynamic heap allocator so that it covers the range from
// `heap_seq_start` up to (not including) `__heap_end`. A single allocator
// manages the whole region.
//
// core_id:        calling core; only core 0 performs the re-initialisation
// heap_seq_start: new start address of the dynamic heap region
//
// No range check is done: the size is an unsigned difference, so a value
// above `&__heap_end` wraps around.
static inline void mempool_dynamic_heap_alloc_reset(const uint32_t core_id,
                                                    uint32_t heap_seq_start) {
  extern uint32_t __heap_end;

  if (core_id != 0) {
    return;
  }

  const uint32_t region_bytes = (uint32_t)&__heap_end - heap_seq_start;
  alloc_init(get_dynamic_heap_alloc(), (uint32_t *)heap_seq_start,
             region_bytes);
}
#ifdef NUM_DAS_PARTITIONS
// Program one DAS partition.
//
// reg_sel:             partition index 0..3; any other value programs
//                      partition 0
// tiles_per_partition: value written to the partition's tiles_das register
// addr:                value written to the partition's start_das register
// size:                region size in bytes; converted to a row count
//                      (one row = NUM_BANKS 32-bit words), rounded up, and
//                      raised to 2 if smaller
//
// The three registers are written in the order tiles, start, rows. Compiler
// barriers before and after keep surrounding memory accesses from being
// reordered across the reconfiguration by the compiler.
static inline void das_config(uint32_t reg_sel, uint32_t tiles_per_partition,
                              uint32_t addr, uint32_t size) {
  asm volatile("" ::: "memory");

  // Round the byte size up to whole rows
  const uint32_t bytes_per_row = NUM_BANKS * sizeof(uint32_t);
  uint32_t num_rows = (size + (bytes_per_row - 1)) / bytes_per_row;

  // enforce minimum 2 rows per partition
  // TODO (bowwang): should add protection to enforce `rows_das` is power of 2
  if (num_rows < 2) {
    num_rows = 2;
  }

  // Pick the register triple; partition 0 doubles as the fallback
  uint32_t volatile *tiles_reg = tiles_das_0_reg;
  uint32_t volatile *start_reg = start_das_0_reg;
  uint32_t volatile *rows_reg = rows_das_0_reg;
  switch (reg_sel) {
  case 1:
    tiles_reg = tiles_das_1_reg;
    start_reg = start_das_1_reg;
    rows_reg = rows_das_1_reg;
    break;
  case 2:
    tiles_reg = tiles_das_2_reg;
    start_reg = start_das_2_reg;
    rows_reg = rows_das_2_reg;
    break;
  case 3:
    tiles_reg = tiles_das_3_reg;
    start_reg = start_das_3_reg;
    rows_reg = rows_das_3_reg;
    break;
  default:
    // reg_sel == 0 and out-of-range selectors keep partition 0
    break;
  }

  // Program DAS registers
  *tiles_reg = tiles_per_partition;
  *start_reg = addr;
  *rows_reg = num_rows;

  asm volatile("" ::: "memory");
}
#endif /* NUM_DAS_PARTITIONS */
This can be diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 94f822ddc..039473a6e 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -89,6 +89,7 @@ RISCV_STRIP ?= $(RISCV_PREFIX)strip # Defines DEFINES += -DPRINTF_DISABLE_SUPPORT_FLOAT -DPRINTF_DISABLE_SUPPORT_LONG_LONG -DPRINTF_DISABLE_SUPPORT_PTRDIFF_T DEFINES += -DNUM_CORES=$(num_cores) +DEFINES += -DLOG2_NUM_CORES=$(shell awk 'BEGIN{print log($(num_cores))/log(2)}') DEFINES += -DNUM_GROUPS=$(num_groups) DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) DEFINES += -DBANKING_FACTOR=$(banking_factor) @@ -110,6 +111,11 @@ ifdef terapool DEFINES += -DNUM_CORES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_sub_groups_per_group)}') DEFINES += -DNUM_TILES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)/$(num_sub_groups_per_group)}') endif +ifdef das + DEFINES += -DNUM_DAS_PARTITIONS=$(num_das_partitions) + DEFINES += -DDAS_MEM_SIZE=$(das_mem_size) + DEFINES += -DLOG2_DAS_MEM_SIZE=$(shell awk 'BEGIN{print log($(das_mem_size))/log(2)}') +endif # Specify cross compilation target. 
This can be omitted if LLVM is built with riscv as default target RISCV_LLVM_TARGET ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RISCV_TARGET) --gcc-toolchain=$(GCC_INSTALL_DIR) diff --git a/software/runtime/synchronization.c b/software/runtime/synchronization.c index c3c3846f8..93fac8321 100644 --- a/software/runtime/synchronization.c +++ b/software/runtime/synchronization.c @@ -10,14 +10,6 @@ #include "runtime.h" #include "synchronization.h" -#if NUM_CORES == (16) -#define LOG2_NUM_CORES (4) -#elif NUM_CORES == (256) -#define LOG2_NUM_CORES (8) -#elif NUM_CORES == (1024) -#define LOG2_NUM_CORES (10) -#endif - uint32_t volatile barrier __attribute__((section(".l1"))); uint32_t volatile log_barrier[NUM_CORES * 4] __attribute__((aligned(NUM_CORES * 4), section(".l1"))); diff --git a/software/tests/baremetal/Makefile b/software/tests/baremetal/Makefile index 53af85328..8c464ad77 100644 --- a/software/tests/baremetal/Makefile +++ b/software/tests/baremetal/Makefile @@ -16,6 +16,9 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) include $(RUNTIME_DIR)/runtime.mk TESTS := $(patsubst $(TESTS_DIR)/%/main.c,%,$(shell find $(TESTS_DIR) -name "main.c")) +ifndef das + TESTS := $(filter-out das_%,$(TESTS)) +endif BINARIES := $(addprefix $(BIN_DIR)/,$(TESTS)) # Make all applications @@ -41,4 +44,4 @@ clean: rm -vf $(RUNTIME) rm -vf $(LINKER_SCRIPT) -.INTERMEDIATE: $(addsuffix /main.c.o,$(TESTS)) +.INTERMEDIATE: $(addsuffix /main.c.o,$(TESTS)) \ No newline at end of file diff --git a/software/tests/baremetal/das_dma/main.c b/software/tests/baremetal/das_dma/main.c new file mode 100644 index 000000000..99460f74e --- /dev/null +++ b/software/tests/baremetal/das_dma/main.c @@ -0,0 +1,102 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include +#include + +#include "alloc.h" +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) +#define NUM_PARTITION_ROWS (2) +uint32_t l2_array[NUM_PARTITION_ROWS * NUM_BANKS] + __attribute__((section(".l2"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + // Initialization + mempool_init(core_id); + mempool_barrier_init(core_id); + + if (core_id == 0) { + + // -------------------------------------------- + // Initialize + // -------------------------------------------- + uint32_t num_tiles_per_partition = 32; + uint32_t array_size = NUM_PARTITION_ROWS * NUM_BANKS; + // Initialize L2 array + for (uint32_t i = 0; i < array_size; i++) { + l2_array[i] = i; + } + + // -------------------------------------------- + // Verify DMA transfers in DAS region + // -------------------------------------------- + printf("Verify DMA transfers in DAS region\n\n"); + + // 1. Init dynamic heap allocator + mempool_dynamic_heap_alloc_init(core_id); + + // 2. Set which partition write to. + uint32_t part_id = 0; + + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), + array_size * sizeof(uint32_t)); + + // 6. Move data + dma_memcpy_blocking(array, l2_array, array_size * sizeof(uint32_t)); + + printf("%4d at address %8X.\n", array[0], &array[0]); + + // 7. 
Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), + array_size * sizeof(uint32_t)); + + printf("%4d at address %8X.\n", array[0], &array[0]); + + // 8. check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t partition_width = + num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR; + uint32_t ele_offset = i % partition_width; + uint32_t row_offset = (i / partition_width) % NUM_PARTITION_ROWS; + uint32_t partition_offset = (i / (partition_width * NUM_PARTITION_ROWS)); + uint32_t *fetch_address = &array[0] + ele_offset + + row_offset * NUM_BANKS + + partition_offset * partition_width; + if (l2_array[i] != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + } + } + + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS on partition %d \n\n", part_id); + + printf("All correct!\n"); + } + + mempool_barrier(num_cores); + return 0; +} diff --git a/software/tests/baremetal/das_malloc_test/main.c b/software/tests/baremetal/das_malloc_test/main.c new file mode 100644 index 000000000..8f564c549 --- /dev/null +++ b/software/tests/baremetal/das_malloc_test/main.c @@ -0,0 +1,190 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + // Initialization + mempool_init(core_id); + mempool_barrier_init(core_id); + + if (core_id == 0) { + + // -------------------------------------------- + // Verify DAS partitions + // -------------------------------------------- + printf("Verify DAS partitions\n\n"); + + uint32_t num_tiles_per_partition = 4; + uint32_t array_size = + 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + + // 1. Init dynamic heap allocator + mempool_dynamic_heap_alloc_init(core_id); + + // 2. Set which partition write to. + for (uint32_t part_id = 0; part_id < NUM_DAS_PARTITIONS; part_id++) { + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + printf("start_addr at 0x%8x\n", array); + + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), + array_size * sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), + array_size * sizeof(uint32_t)); + // 8. 
check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS on partition %d \n\n", part_id); + } + + // -------------------------------------------- + // Verify DAS partitions with misalignment + // -------------------------------------------- + printf("Verify DAS partitions with misalignemnt\n\n"); + + // 2. Set which partition write to. + for (uint32_t part_id = 0; part_id < NUM_DAS_PARTITIONS; part_id++) { + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + + // 4.0 inject misalignment + uint32_t offset = 32 * (1 + part_id); + uint32_t *misalign = (uint32_t *)partition_malloc( + dynamic_heap_alloc, (2 * NUM_BANKS + offset) * sizeof(uint32_t)); + printf("Inject misalignment at 0x%8x with size 0x%8x in byte\n", misalign, + offset * part_id); + + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + printf("Aligned start_addr at 0x%8x\n", array); + + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), + array_size * sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), + array_size * sizeof(uint32_t)); + // 8. 
check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + partition_free(dynamic_heap_alloc, misalign); + printf("SUCCESS on partition %d \n\n", part_id); + } + + // -------------------------------------------- + // Verify DAS per Tile groups + // -------------------------------------------- + printf("Verify DAS per Tile-groups\n\n"); + + // 2. Set which partition write to. + uint32_t part_id = 0; + for (num_tiles_per_partition = 1; num_tiles_per_partition < NUM_TILES; + num_tiles_per_partition *= 2) { + array_size = + 2 * num_tiles_per_partition * BANKING_FACTOR * NUM_CORES_PER_TILE; + // 3. Get the allocator + alloc_t *dynamic_heap_alloc = get_dynamic_heap_alloc(); + alloc_dump(dynamic_heap_alloc); + // 4. Allocate memory + uint32_t *array = (uint32_t *)partition_malloc( + dynamic_heap_alloc, array_size * sizeof(uint32_t)); + // 5. Config the hardware registers + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), + array_size * sizeof(uint32_t)); + // 6. Move data + for (uint32_t i = 0; i < array_size; i++) { + array[i] = i; + } + // 7. Change addressing scheme (to fully interleaved) + das_config(part_id, NUM_TILES, (uint32_t)(array), + array_size * sizeof(uint32_t)); + // partition_config(part_id, NUM_TILES); + // 8. 
check + for (uint32_t i = 0; i < array_size; i++) { + uint32_t *fetch_address = + &array[0] + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i, *fetch_address, + fetch_address); + return 1; + } + } + // 9. Free array + partition_free(dynamic_heap_alloc, array); + printf("SUCCESS for groups of %d tiles over the partition \n\n", + num_tiles_per_partition); + } + + printf("All correct!\n"); + } + + mempool_barrier(num_cores); + return 0; +} diff --git a/software/tests/baremetal/das_static_test/main.c b/software/tests/baremetal/das_static_test/main.c new file mode 100644 index 000000000..39432a353 --- /dev/null +++ b/software/tests/baremetal/das_static_test/main.c @@ -0,0 +1,73 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Note: This test is only for Terapool dynamic heap allocation +// Author: Bowen Wang + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define NUM_TILES (NUM_CORES / NUM_CORES_PER_TILE) +#define ARRAY_SIZE (4096) + +uint32_t array[ARRAY_SIZE] + __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + + if (core_id == 0) { + + // -------------------------------------------- + // Verify DAS partitions + // -------------------------------------------- + printf("Verify DAS partitions\n\n"); + + uint32_t num_tiles_per_partition = 64; + uint32_t part_id = 0; + + uint32_t num_partitions = NUM_TILES / num_tiles_per_partition; + uint32_t size_partition = ARRAY_SIZE / num_partitions; + + das_config(part_id, num_tiles_per_partition, (uint32_t)(array), + ARRAY_SIZE * sizeof(uint32_t)); + for (uint32_t i = 0; i < ARRAY_SIZE; i++) { + array[i] = i; + } + + das_config(part_id, NUM_TILES, (uint32_t)(array), + ARRAY_SIZE * sizeof(uint32_t)); + for (uint32_t j = 0; j < num_partitions; j++) { + for (uint32_t i = 0; i < size_partition; i++) { + + uint32_t *fetch_address = + &array[0] + + j * (num_tiles_per_partition * NUM_CORES_PER_TILE * + BANKING_FACTOR) + + (i % + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) + + (i / + (num_tiles_per_partition * NUM_CORES_PER_TILE * BANKING_FACTOR)) * + NUM_BANKS; + if (i + j * size_partition != *fetch_address) { + printf("%4d != %4d at address %8X.\n", i + j * size_partition, + *fetch_address, fetch_address); + return 1; + } + } + } + printf("SUCCESS on partition %d\n", part_id); + } + + mempool_barrier(num_cores); + return 0; +}