Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
- Aligned `core_lib::math::u256` user docs with unified LE stack limb ordering (`a0/b0` on top), removing conflicting `[b7..b0, a7..a0]` notation ([#3066](https://github.com/0xMiden/miden-vm/pull/3066)).
- Made all internal `core::math` procedures natively little-endian ([#3084](https://github.com/0xMiden/miden-vm/pull/3084)).
- [BREAKING] Updated the Miden crypto stack to `miden-crypto` v0.25, and switched SMT leaf hashing to use Poseidon2 domain separation so masm-side leaf digests match `SmtLeaf::hash()` ([#3095](https://github.com/0xMiden/miden-vm/pull/3095)).
- Improved performance of auxiliary trace generation ([#3119](https://github.com/0xMiden/miden-vm/pull/3119)).
- [BREAKING] Reject post-last operation-indexed decorators in block assembly and serialized MAST forests; use `after_exit` for decorators that run after a block exits ([#3114](https://github.com/0xMiden/miden-vm/pull/3114)).
- [BREAKING] Removed `Continuation::AfterExitDecoratorsBasicBlock`. New MAST merges operation-indexed decorators at the post-last-op sentinel index into `after_exit` at build time; execution uses `AfterExitDecorators` only, with legacy forests still supported ([#2633](https://github.com/0xMiden/miden-vm/issues/2633)).

Expand Down
22 changes: 21 additions & 1 deletion air/src/lookup/aux_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ use super::{Challenges, LookupAir, ProverLookupBuilder, prover::build_lookup_fra
/// [`crate::trace::main_trace::ROW_MAJOR_CHUNK_SIZE`] so we stay consistent with the
/// repo's row-major tuning: ~512 rows × avg shape ~3 ≈ 1.5 K fractions per chunk and
/// ~24 KiB of chunk-local scratch, comfortably L1-resident on any modern x86/arm core.
const ACCUMULATE_ROWS_PER_CHUNK: usize = 512;
pub(crate) const ACCUMULATE_ROWS_PER_CHUNK: usize = 512;

// TOP-LEVEL DRIVER
// ================================================================================================
Expand Down Expand Up @@ -174,6 +174,26 @@ where
}
}

/// Assembles a `LookupFractions` directly from pre-populated `fractions` and
/// `counts` buffers, skipping per-row accumulation.
///
/// Used by the concurrent path to stitch chunk-local results back into a single
/// instance; `counts` is expected to be row-major with one entry per
/// (row, column) pair.
#[cfg(feature = "concurrent")]
pub(super) fn from_parts(
    shape: Vec<usize>,
    num_rows: usize,
    fractions: Vec<(F, EF)>,
    counts: Vec<usize>,
) -> Self {
    // One permutation column per shape entry; the counts buffer must cover
    // every (row, column) cell exactly once.
    let num_cols = shape.len();
    debug_assert_eq!(num_rows * num_cols, counts.len());

    Self { fractions, counts, shape, num_rows, num_cols }
}

/// Number of permutation columns.
pub fn num_columns(&self) -> usize {
self.num_cols
Expand Down
65 changes: 50 additions & 15 deletions air/src/lookup/prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,32 +158,67 @@ pub fn build_lookup_fractions<A, F, EF>(
where
F: Field,
EF: ExtensionField<F>,
A: Sync,
for<'a> A: LookupAir<ProverLookupBuilder<'a, F, EF>>,
{
let num_rows = main_trace.height();
let width = main_trace.width();
let flat: &[F] = main_trace.values.borrow();

let shape = air.column_shape().to_vec();
let mut fractions = LookupFractions::from_shape(shape, num_rows);

// Per-row periodic slice, filled in place each row — no per-iteration allocation.
let mut periodic_row: Vec<F> = vec![F::ZERO; periodic_columns.len()];

for r in 0..num_rows {
let curr = &flat[r * width..(r + 1) * width];
let nxt_idx = (r + 1) % num_rows;
let next = &flat[nxt_idx * width..(nxt_idx + 1) * width];
let window = RowWindow::from_two_rows(curr, next);
// Fill one chunk of rows into a fresh per-chunk `LookupFractions`.
let process_chunk = |row_lo: usize, row_hi: usize| -> LookupFractions<F, EF> {
let mut chunk = LookupFractions::from_shape(shape.clone(), row_hi - row_lo);
let mut periodic_row: Vec<F> = vec![F::ZERO; periodic_columns.len()];
for r in row_lo..row_hi {
let curr = &flat[r * width..(r + 1) * width];
let nxt_idx = (r + 1) % num_rows;
let next = &flat[nxt_idx * width..(nxt_idx + 1) * width];
let window = RowWindow::from_two_rows(curr, next);
for (i, col) in periodic_columns.iter().enumerate() {
periodic_row[i] = col[r % col.len()];
}
let mut lb =
ProverLookupBuilder::new(window, &periodic_row, challenges, air, &mut chunk);
air.eval(&mut lb);
}
chunk
};

for (i, col) in periodic_columns.iter().enumerate() {
periodic_row[i] = col[r % col.len()];
#[cfg(not(feature = "concurrent"))]
let fractions = process_chunk(0, num_rows);

// Concatenation after parallel processing preserves global row order because chunks
// tile `0..num_rows` contiguously and each chunk's `fractions` / `counts` are
// row-major within the chunk.
#[cfg(feature = "concurrent")]
let fractions = {
use miden_crypto::parallel::*;

let num_cols = shape.len();
let rows_per_chunk = crate::lookup::aux_builder::ACCUMULATE_ROWS_PER_CHUNK;
let num_chunks = num_rows.div_ceil(rows_per_chunk);

let chunks: Vec<LookupFractions<F, EF>> = (0..num_chunks)
.into_par_iter()
.map(|chunk_idx| {
let row_lo = chunk_idx * rows_per_chunk;
let row_hi = (row_lo + rows_per_chunk).min(num_rows);
process_chunk(row_lo, row_hi)
})
.collect();

let total_fractions: usize = chunks.iter().map(|c| c.fractions.len()).sum();
let mut fractions_vec: Vec<(F, EF)> = Vec::with_capacity(total_fractions);
let mut counts_vec: Vec<usize> = Vec::with_capacity(num_rows * num_cols);
for chunk in chunks {
fractions_vec.extend(chunk.fractions);
counts_vec.extend(chunk.counts);
}

let mut lb =
ProverLookupBuilder::new(window, &periodic_row, challenges, air, &mut fractions);
air.eval(&mut lb);
}
LookupFractions::from_parts(shape, num_rows, fractions_vec, counts_vec)
};

debug_assert_eq!(
fractions.counts().len(),
Expand Down
2 changes: 1 addition & 1 deletion processor/src/trace/chiplets/memory/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ impl Memory {
};

let (delta_hi, delta_lo) = split_u32_into_u16(delta);
range.add_range_checks(row, &[delta_lo, delta_hi]);
range.add_range_checks(&[delta_lo, delta_hi]);

// word index decomposition range checks: prove addr is a valid 32-bit value
// by checking w0, w1, and 4*w1 are all in [0, 2^16).
Expand Down
4 changes: 2 additions & 2 deletions processor/src/trace/parallel/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,8 @@ fn initialize_range_checker(
let mut range_checker = RangeChecker::new();

// Add all u32 range checks recorded during execution
for (clk, values) in range_checker_replay.into_iter() {
range_checker.add_range_checks(clk, &values);
for (_clk, values) in range_checker_replay.into_iter() {
range_checker.add_range_checks(&values);
}

// Add all memory-related range checks
Expand Down
20 changes: 2 additions & 18 deletions processor/src/trace/range/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ use core::mem::MaybeUninit;

use miden_air::trace::RANGE_CHECK_TRACE_WIDTH;

use super::RowIndex;
use crate::{
Felt, ZERO,
utils::{assume_init_vec, uninit_vector},
Expand Down Expand Up @@ -50,10 +49,6 @@ pub struct RangeCheckTrace {
pub struct RangeChecker {
/// Tracks lookup count for each checked value.
lookups: BTreeMap<u16, usize>,
/// Range check lookups performed by all user operations, grouped and sorted by clock cycle.
/// Each cycle is mapped to a vector of the range checks requested at that cycle, which can
/// come from the stack, memory, or both.
cycle_lookups: BTreeMap<RowIndex, Vec<u16>>,
}

impl RangeChecker {
Expand All @@ -66,7 +61,7 @@ impl RangeChecker {
// range checker table are initialized. this simplifies trace table building later on.
lookups.insert(0, 0);
lookups.insert(u16::MAX, 0);
Self { lookups, cycle_lookups: BTreeMap::new() }
Self { lookups }
}

// TRACE MUTATORS
Expand All @@ -78,7 +73,7 @@ impl RangeChecker {
}

/// Adds range check lookups from the stack or memory to this [RangeChecker] instance.
pub fn add_range_checks(&mut self, clk: RowIndex, values: &[u16]) {
pub fn add_range_checks(&mut self, values: &[u16]) {
// range checks requests only come from memory or from the stack, which always request 2 or
// 4 lookups respectively.
debug_assert!(values.len() == 2 || values.len() == 4);
Expand All @@ -87,17 +82,6 @@ impl RangeChecker {
// add the specified value to the trace of this range checker's lookups.
self.add_value(*value);
}

// track the range check requests at each cycle
// TODO: optimize this to use a struct instead of vectors, e.g. (#2793):
// struct MemoryLookupValues {
// num_lookups: u8,
// lookup_values: [u16; 6],
// }
self.cycle_lookups
.entry(clk)
.and_modify(|entry| entry.extend_from_slice(values))
.or_insert_with(|| values.to_vec());
}

// EXECUTION TRACE GENERATION (INTERNAL)
Expand Down
Loading