-
Notifications
You must be signed in to change notification settings - Fork 61
[DRAFT] Dynamic allocation scheme #126
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 49 commits
fb21be6
1989369
2b5cc29
320854a
afad72f
4087bf4
6cfabd8
bd0386b
6ef5845
9199936
c155c4d
a92a71f
833ae78
699eb47
bbf4280
56654a5
9c02172
d26b44a
92242b7
543027b
473bd0e
737d106
ee0ff54
58fdcd4
cb090bf
1982a12
12cdd27
50f4326
6929c05
595e005
4b65f4e
3090712
2f2521a
f52591e
221d00f
64003ac
2f39f8f
e19b90d
0a7a90c
c6a8fd8
5c11838
13f9335
d738153
47a0425
9777579
6252225
b5b6c7d
997392e
e982df0
310d75c
2423007
b7cc499
7504598
c2bac05
0cde68d
c7163dc
7a000bc
bd5e0a7
e51b268
40d2db2
e1fd77d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,4 +73,4 @@ sources: | |
| - target: fpga | ||
| files: | ||
| # Level 1 | ||
| - hardware/src/axi_rab_wrap.sv | ||
| - hardware/src/axi_rab_wrap.sv | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,10 @@ | ||
| [](https://github.com/pulp-platform/mempool/actions/workflows/ci.yml) | ||
| [](https://github.com/pulp-platform/mempool/actions/workflows/lint.yml) | ||
| [](https://opensource.org/licenses/Apache-2.0) | ||
| # MemPool Dynamic Allocation Scheme | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add this to the bottom of
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This part has been moved to config/README.md |
||
| The Dynamic Allocation Scheme (DAS) is a flexible, adaptable, runtime-configurable address-mapping technique. DAS remaps contiguous address spaces to physically adjacent memory banks based on the workload’s memory access patterns, placing the data physically close to the PEs. | ||
|
|
||
| This repository branch contains DAS extensions based on MemPool. | ||
|
|
||
| # MemPool | ||
|
|
||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can the address scrambler of the
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch, I have aligned the naming accordingly. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| // Copyright 2021 ETH Zurich and University of Bologna. | ||
| // Solderpad Hardware License, Version 0.51, see LICENSE for details. | ||
| // SPDX-License-Identifier: SHL-0.51 | ||
|
|
||
| // Description: Address scrambler for the iDMA midend. The scrambling scheme is | ||
| // determined by group_factor. | ||
| // Current constraints: NumBanksPerTile must be at least 2 (checked at elaboration). | ||
|
|
||
| // Author: Bowen Wang <bowwang@student.ethz.ch> | ||
| // Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch> | ||
|
|
||
// Address scrambler for the iDMA midend.
//
// Combinationally translates `address_i` into `address_o` and reports the
// group factor / allocated size that apply to the request:
//   - Bypass (or NumTiles < 2): the address is passed through unchanged and
//     the two side-band outputs are tied to zero.
//   - address_i <  DASStartAddr: the address is passed through unchanged,
//     `group_factor_o` is NumTiles (fully interleaved) and `allocated_size_o`
//     is num_bytes_i / MemSizePerRow.
//   - address_i >= DASStartAddr: every DAS partition `p` whose window
//     [start_addr_scheme_i[p], start_addr_scheme_i[p] + MemSizePerRow*allocated_size_i[p])
//     contains the address rewrites the outputs; because the loop runs from
//     partition 0 upwards, the highest-indexed matching partition wins. If no
//     partition matches, the address is left unscrambled and the side-band
//     outputs are zero.
//
// Parameters:
//   AddrWidth        Width of address_i / address_o.
//   DataWidth        Width of each start_addr_scheme_i entry.
//   ByteOffset       Number of byte-offset address bits (LSBs that are never moved).
//   Bypass           When set, disables scrambling entirely.
//   NumTiles         Number of tiles; sizes the tile-ID field and the side-band ports.
//   NumBanksPerTile  Banks per tile; must be at least 2 (checked at elaboration).
//   TCDMSizePerBank  Only used to derive MemSizePerTile.
//   NumDASPartitions Number of independently configured DAS partitions.
//   DASStartAddr     Addresses below this value are never scrambled.
//   MemSizePerTile   Derived; not referenced inside this module.
//   MemSizePerRow    Derived; bytes covered by one row across all tiles.
module idma_address_scrambler #(
  parameter int unsigned AddrWidth        = 32,
  parameter int unsigned DataWidth        = 32,
  parameter int unsigned ByteOffset       = 2,
  parameter bit          Bypass           = 0,
  parameter int unsigned NumTiles         = 128,
  parameter int unsigned NumBanksPerTile  = 32,
  parameter int unsigned TCDMSizePerBank  = 1024,
  parameter int unsigned NumDASPartitions = 4,
  parameter int unsigned DASStartAddr     = 1024,
  parameter int unsigned MemSizePerTile   = NumBanksPerTile*TCDMSizePerBank,
  parameter int unsigned MemSizePerRow    = (1 << ByteOffset)*NumBanksPerTile*NumTiles
) (
  input  logic [AddrWidth-1:0]                              address_i,
  input  logic [31:0]                                       num_bytes_i,
  input  logic [NumDASPartitions-1:0][$clog2(NumTiles):0]   group_factor_i,
  input  logic [NumDASPartitions-1:0][$clog2(NumTiles):0]   allocated_size_i,
  input  logic [NumDASPartitions-1:0][DataWidth-1:0]        start_addr_scheme_i,
  output logic [$clog2(NumTiles):0]                         group_factor_o,
  output logic [$clog2(NumTiles):0]                         allocated_size_o,
  output logic [AddrWidth-1:0]                              address_o
);
  // Basic Settings
  localparam int unsigned BankOffsetBits  = $clog2(NumBanksPerTile);
  localparam int unsigned TileIdBits      = $clog2(NumTiles);
  // Byte-offset and bank-offset bits: the LSBs that scrambling never touches
  localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits;

  if (Bypass || NumTiles < 2) begin
    assign address_o        = address_i;
    // Drive the side-band outputs as well so that they are never left
    // floating/X when scrambling is disabled. Zero matches the default used
    // by the scrambling branch when no partition applies.
    assign group_factor_o   = '0;
    assign allocated_size_o = '0;
  end else begin

    // ------ Heap Sequential Signals ------ //

    // `tile_index` : how many bits to shift for TileID bits in each partition
    // `row_index`: how many bits need to swap within Row Index
    logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] tile_index;
    logic [NumDASPartitions-1:0][$clog2($clog2(NumTiles)+1)-1:0] row_index;

    // One pair of `lzc` instances per partition turns the configured group
    // factor / allocated size into a bit count.
    // NOTE(review): presumably MODE 1'b0 selects trailing-zero counting, i.e.
    // log2 for power-of-two inputs - confirm against the `lzc` implementation.
    for (genvar i = 0; i < NumDASPartitions; i++) begin : gen_shift_index
      lzc #(
        .WIDTH ($clog2(NumTiles)+1),
        .MODE  (1'b0              )
      ) i_log_tile_index (
        .in_i    (group_factor_i[i]),
        .cnt_o   (tile_index[i]    ),
        .empty_o (/* Unused */     )
      );
      lzc #(
        .WIDTH ($clog2(NumTiles)+1),
        .MODE  (1'b0              )
      ) i_log_row_index (
        .in_i    (allocated_size_i[i][$clog2(NumTiles):0]),
        .cnt_o   (row_index[i]                           ),
        .empty_o (/* Unused */                           )
      );
    end

    always_comb begin

      // Default: Unscrambled
      address_o        = address_i;
      group_factor_o   = '0;
      allocated_size_o = '0;

      // TODO (bowwang): add a new register to indicate the start addr of sequential heap region, currently hard coded
      if (address_i < DASStartAddr) begin
        group_factor_o   = NumTiles; // fully interleaved
        allocated_size_o = num_bytes_i / MemSizePerRow;

      // DAS address scrambling
      end else begin

        for (int p = 0; p < NumDASPartitions; p++) begin
          if ( (address_i >= start_addr_scheme_i[p]) && (address_i < start_addr_scheme_i[p]+MemSizePerRow*allocated_size_i[p]) ) begin
            address_o = '0;
            // 1) Keep the low (tile_index + ConstantBitsLSB) bits in place
            address_o |= address_i & ((1 << (tile_index[p]+ConstantBitsLSB)) - 1);
            // 2) Move the bits starting at (row_index + tile_index + ConstantBitsLSB)
            //    down to (tile_index + ConstantBitsLSB), limited to the tile-ID field
            address_o |= ((address_i >> (row_index[p]+tile_index[p]+ConstantBitsLSB)) << (tile_index[p]+ConstantBitsLSB)) & ((1 << (TileIdBits+ConstantBitsLSB)) - 1);
            // 3) Move the bits starting at (tile_index + ConstantBitsLSB) up to
            //    just above the tile-ID field, limited to row_index bits
            address_o |= ((address_i >> (tile_index[p]+ConstantBitsLSB)) << (TileIdBits + ConstantBitsLSB)) & ((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1);
            // 4) Keep all remaining upper bits in place
            address_o |= address_i & ~((1 << (row_index[p]+TileIdBits+ConstantBitsLSB)) - 1);
            group_factor_o   = group_factor_i[p];
            allocated_size_o = allocated_size_i[p];
          end
        end

      end
    end

  end

  // Check for unsupported configurations
  if (NumBanksPerTile < 2)
    $fatal(1, "NumBanksPerTile must be at least 2. The special case '1' is currently not supported!");

endmodule : idma_address_scrambler
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,8 @@ | |
| // SPDX-License-Identifier: SHL-0.51 | ||
|
|
||
| // Samuel Riedel <sriedel@iis.ee.ethz.ch> | ||
| // Bowen Wang <bowwang@student.ethz.ch> | ||
| // Marco Bertuletti <mbertuletti@iis.ee.ethz.ch> | ||
|
|
||
| `include "common_cells/registers.svh" | ||
|
|
||
|
|
@@ -17,23 +19,31 @@ module idma_distributed_midend #( | |
| parameter int unsigned DmaRegionEnd = 32'h1000_0000, | ||
| /// Number of generic 1D requests that can be buffered | ||
| parameter int unsigned TransFifoDepth = 1, | ||
| `ifdef DAS | ||
| parameter int unsigned NumTiles = 64, | ||
| parameter int unsigned NumDASPartitions = 4, | ||
| `endif | ||
| /// Arbitrary 1D burst request definition | ||
| parameter type burst_req_t = logic, | ||
| /// Meta data response definition | ||
| parameter type meta_t = logic | ||
| ) ( | ||
| input logic clk_i, | ||
| input logic rst_ni, | ||
| input logic clk_i, | ||
| input logic rst_ni, | ||
| `ifdef DAS | ||
| // DAS signals | ||
| input logic [$clog2(NumTiles):0] rows_das_i, | ||
| `endif | ||
| // Slave | ||
| input burst_req_t burst_req_i, | ||
| input logic valid_i, | ||
| output logic ready_o, | ||
| output meta_t meta_o, | ||
| input burst_req_t burst_req_i, | ||
| input logic valid_i, | ||
| output logic ready_o, | ||
| output meta_t meta_o, | ||
| // Master | ||
| output burst_req_t [NoMstPorts-1:0] burst_req_o, | ||
| output logic [NoMstPorts-1:0] valid_o, | ||
| input logic [NoMstPorts-1:0] ready_i, | ||
| input meta_t [NoMstPorts-1:0] meta_i | ||
| output burst_req_t [NoMstPorts-1:0] burst_req_o, | ||
| output logic [NoMstPorts-1:0] valid_o, | ||
| input logic [NoMstPorts-1:0] ready_i, | ||
| input meta_t [NoMstPorts-1:0] meta_i | ||
| ); | ||
|
|
||
| localparam DmaRegionAddressBits = $clog2(DmaRegionWidth); | ||
|
|
@@ -57,6 +67,7 @@ module idma_distributed_midend #( | |
| // Collect the `trans_complete` signals and reduce them once we have all of them | ||
| logic empty; | ||
| logic data; | ||
| logic push; | ||
| fifo_v3 #( | ||
| .FALL_THROUGH (0 ), | ||
| .DATA_WIDTH (1 ), | ||
|
|
@@ -70,12 +81,44 @@ module idma_distributed_midend #( | |
| .empty_o (empty ), | ||
| .usage_o (/*unused*/ ), | ||
| .data_i (1'b1 ), | ||
| .push_i (trans_complete_d[i] ), | ||
| .push_i (push ), | ||
| .data_o (data ), | ||
| .pop_i (meta_o.trans_complete) | ||
| ); | ||
| assign trans_complete_d[i] = meta_i[i].trans_complete | tie_off_trans_complete_q[i]; | ||
| assign trans_complete_q[i] = data && !empty; | ||
|
|
||
| `ifdef DAS | ||
| // Handle the case where two completion signals arrive at the same time | ||
| logic [NumDASPartitions-1:0] conflict_counter_d, conflict_counter_q; | ||
| `FF(conflict_counter_q, conflict_counter_d, '0, clk_i, rst_ni) | ||
| always_comb begin | ||
| push = trans_complete_d[i] && !fifo_full[i]; | ||
| conflict_counter_d = conflict_counter_q; | ||
| // FIFO is not full | ||
| if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && !fifo_full[i]) begin | ||
| conflict_counter_d = conflict_counter_q+1; | ||
| end | ||
| // FIFO is full | ||
| if (meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin | ||
| conflict_counter_d = conflict_counter_q+2; | ||
| end | ||
| if (!meta_i[i].trans_complete && tie_off_trans_complete_q[i] && fifo_full[i]) begin | ||
| conflict_counter_d = conflict_counter_q+1; | ||
| end | ||
| if (meta_i[i].trans_complete && !tie_off_trans_complete_q[i] && fifo_full[i]) begin | ||
| conflict_counter_d = conflict_counter_q+1; | ||
| end | ||
| // FIFO is not full, safe to push | ||
| if (|conflict_counter_q && !trans_complete_d[i] && !fifo_full[i] ) begin | ||
| push = 1'b1; | ||
| conflict_counter_d = conflict_counter_q-1; | ||
| end | ||
| end | ||
| `else | ||
| assign push = trans_complete_d[i]; | ||
| `endif | ||
|
|
||
| end | ||
|
|
||
| always_comb begin | ||
|
|
@@ -106,6 +149,7 @@ module idma_distributed_midend #( | |
| assign dst_addr = burst_req_i.dst[FullRegionAddressBits-1:0]; | ||
|
|
||
| always_comb begin | ||
|
|
||
| if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin | ||
| start_addr = src_addr; | ||
| end else begin | ||
|
|
@@ -126,6 +170,23 @@ module idma_distributed_midend #( | |
| burst_req_o[i].dst = burst_req_i.dst; | ||
| // Modify lower addresses bits and size | ||
| if (($unsigned(start_addr) >= (i+1)*DmaRegionWidth) || ($unsigned(end_addr) <= i*DmaRegionWidth)) begin | ||
| `ifdef DAS | ||
| burst_req_o[i].num_bytes = (burst_req_i.num_bytes<DmaRegionWidth) ? burst_req_i.num_bytes : DmaRegionWidth; | ||
| if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin | ||
| burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth; | ||
| burst_req_o[i].dst = burst_req_i.dst+i*rows_das_i*DmaRegionWidth; | ||
| end else begin | ||
| // L2 --> L1 | ||
| if (burst_req_i.num_bytes<=DmaRegionWidth )begin | ||
| burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; | ||
| end else if (i==2) begin | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need to change |
||
| burst_req_o[i].src = burst_req_i.src+i*rows_das_i*DmaRegionWidth; | ||
| end else if (i==3) begin | ||
| burst_req_o[i].src = burst_req_i.src+(i-1)*rows_das_i*DmaRegionWidth + DmaRegionWidth; | ||
| end | ||
| burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth; | ||
| end | ||
| `else | ||
| // We are not involved in the transfer | ||
| burst_req_o[i].src = '0; | ||
| burst_req_o[i].dst = '0; | ||
|
|
@@ -137,6 +198,7 @@ module idma_distributed_midend #( | |
| if (valid[i]) begin | ||
| tie_off_trans_complete_d[i] = 1'b1; | ||
| end | ||
| `endif | ||
| end else if (($unsigned(start_addr) >= i*DmaRegionWidth)) begin | ||
| // First (and potentially only) slice | ||
| // Leave address as is | ||
|
|
@@ -146,6 +208,16 @@ module idma_distributed_midend #( | |
| burst_req_o[i].num_bytes = DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; | ||
| end | ||
| end else begin | ||
| `ifdef DAS | ||
| // Round up the address to the next DMA boundary | ||
| if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin | ||
| burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; | ||
| burst_req_o[i].dst = burst_req_i.dst+i*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; | ||
| end else begin | ||
| burst_req_o[i].src = burst_req_i.src+(i-start_addr[DmaRegionAddressBits+1:DmaRegionAddressBits])*DmaRegionWidth-start_addr[DmaRegionAddressBits-1:0]; | ||
| burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; | ||
| end | ||
| `else | ||
| // Round up the address to the next DMA boundary | ||
| if (($unsigned(burst_req_i.src) >= DmaRegionStart) && ($unsigned(burst_req_i.src) < DmaRegionEnd)) begin | ||
| burst_req_o[i].src[FullRegionAddressBits-1:0] = i*DmaRegionWidth; | ||
|
|
@@ -154,6 +226,7 @@ module idma_distributed_midend #( | |
| burst_req_o[i].src = burst_req_i.src+i*DmaRegionWidth-start_addr; | ||
| burst_req_o[i].dst[FullRegionAddressBits-1:0] = i*DmaRegionWidth; | ||
| end | ||
| `endif | ||
| if ($unsigned(end_addr) >= (i+1)*DmaRegionWidth) begin | ||
| // Middle slice | ||
| // Emit a full-sized transfer | ||
|
|
@@ -172,9 +245,9 @@ module idma_distributed_midend #( | |
| automatic string str; | ||
| if (rst_ni && valid_i && ready_o) begin | ||
| str = "[idma_distributed_midend] Got request\n"; | ||
| str = $sformatf("%sRequest in: From: 0x%8x To: 0x%8x with size %d\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes); | ||
| str = $sformatf("%sRequest in: From: 0x%8x To: 0x%8x with size 0x%8x (%d)\n", str, burst_req_i.src, burst_req_i.dst, burst_req_i.num_bytes, burst_req_i.num_bytes); | ||
| for (int i = 0; i < NoMstPorts; i++) begin | ||
| str = $sformatf("%sOut %6d: From: 0x%8x To: 0x%8x with size %d\n", str, i, burst_req_o[i].src, burst_req_o[i].dst, burst_req_o[i].num_bytes); | ||
| str = $sformatf("%sRequest Out %6d: From: 0x%8x To: 0x%8x with size 0x%8x (%d)\n", str, i, burst_req_o[i].src, burst_req_o[i].dst, burst_req_o[i].num_bytes, burst_req_o[i].num_bytes); | ||
| end | ||
| f = $fopen("dma.log", "a"); | ||
| $fwrite(f, str); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please move this to the bottom of the list.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved to the end of the list